Linux-2.6.12-rc2v2.6.12-rc2

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 15:20:36 -0700
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4
download: linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz
linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2
linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip
155 files changed, 82733 insertions, 0 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
new file mode 100644
index 000000000000..6d3e8b1bd1f2
--- /dev/null
+++ b/net/ipv4/Kconfig
@@ -0,0 +1,411 @@
+#
+# IP configuration
+#
+config IP_MULTICAST
+	bool "IP: multicasting"
+	depends on INET
+	help
+	  This is code for addressing several networked computers at once,
+	  enlarging your kernel by about 2 KB. You need multicasting if you
+	  intend to participate in the MBONE, a high bandwidth network on top
+	  of the Internet which carries audio and video broadcasts. More
+	  information about the MBONE is on the WWW at
+	  <http://www-itg.lbl.gov/mbone/>. Information about the multicast
+	  capabilities of the various network cards is contained in
+	  <file:Documentation/networking/multicast.txt>. For most people, it's
+	  safe to say N.
+
+config IP_ADVANCED_ROUTER
+	bool "IP: advanced router"
+	depends on INET
+	---help---
+	  If you intend to run your Linux box mostly as a router, i.e. as a
+	  computer that forwards and redistributes network packets, say Y; you
+	  will then be presented with several options that allow more precise
+	  control about the routing process.
+
+	  The answer to this question won't directly affect the kernel:
+	  answering N will just cause the configurator to skip all the
+	  questions about advanced routing.
+
+	  Note that your box can only act as a router if you enable IP
+	  forwarding in your kernel; you can do that by saying Y to "/proc
+	  file system support" and "Sysctl support" below and executing the
+	  line
+
+	  echo "1" > /proc/sys/net/ipv4/ip_forward
+
+	  at boot time after the /proc file system has been mounted.
+
+	  If you turn on IP forwarding, you will also get the rp_filter, which
+	  automatically rejects incoming packets if the routing table entry
+	  for their source address doesn't match the network interface they're
+	  arriving on. This has security advantages because it prevents the
+	  so-called IP spoofing, however it can pose problems if you use
+	  asymmetric routing (packets from you to a host take a different path
+	  than packets from that host to you) or if you operate a non-routing
+	  host which has several IP addresses on different interfaces. To turn
+	  rp_filter off use:
+
+	  echo 0 > /proc/sys/net/ipv4/conf/<device>/rp_filter
+	  or
+	  echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter
+
+	  If unsure, say N here.
+
+config IP_MULTIPLE_TABLES
+	bool "IP: policy routing"
+	depends on IP_ADVANCED_ROUTER
+	---help---
+	  Normally, a router decides what to do with a received packet based
+	  solely on the packet's final destination address. If you say Y here,
+	  the Linux router will also be able to take the packet's source
+	  address into account. Furthermore, the TOS (Type-Of-Service) field
+	  of the packet can be used for routing decisions as well.
+
+	  If you are interested in this, please see the preliminary
+	  documentation at <http://www.compendium.com.ar/policy-routing.txt>
+	  and <ftp://post.tepkom.ru/pub/vol2/Linux/docs/advanced-routing.tex>.
+	  You will need supporting software from
+	  <ftp://ftp.tux.org/pub/net/ip-routing/>.
+
+	  If unsure, say N.
+
+config IP_ROUTE_FWMARK
+	bool "IP: use netfilter MARK value as routing key"
+	depends on IP_MULTIPLE_TABLES && NETFILTER
+	help
+	  If you say Y here, you will be able to specify different routes for
+	  packets with different mark values (see iptables(8), MARK target).
+
+config IP_ROUTE_MULTIPATH
+	bool "IP: equal cost multipath"
+	depends on IP_ADVANCED_ROUTER
+	help
+	  Normally, the routing tables specify a single action to be taken in
+	  a deterministic manner for a given packet. If you say Y here
+	  however, it becomes possible to attach several actions to a packet
+	  pattern, in effect specifying several alternative paths to travel
+	  for those packets. The router considers all these paths to be of
+	  equal "cost" and chooses one of them in a non-deterministic fashion
+	  if a matching packet arrives.
+
+config IP_ROUTE_MULTIPATH_CACHED
+	bool "IP: equal cost multipath with caching support (EXPERIMENTAL)"
+	depends on: IP_ROUTE_MULTIPATH
+	help
+	  Normally, equal cost multipath routing is not supported by the
+	  routing cache. If you say Y here, alternative routes are cached
+	  and on cache lookup a route is chosen in a configurable fashion.
+
+	  If unsure, say N.
+
+config IP_ROUTE_MULTIPATH_RR
+	tristate "MULTIPATH: round robin algorithm"
+	depends on IP_ROUTE_MULTIPATH_CACHED
+	help
+	  Mulitpath routes are chosen according to Round Robin
+
+config IP_ROUTE_MULTIPATH_RANDOM
+	tristate "MULTIPATH: random algorithm"
+	depends on IP_ROUTE_MULTIPATH_CACHED
+	help
+	  Multipath routes are chosen in a random fashion. Actually,
+	  there is no weight for a route. The advantage of this policy
+	  is that it is implemented stateless and therefore introduces only
+	  a very small delay.
+
+config IP_ROUTE_MULTIPATH_WRANDOM
+	tristate "MULTIPATH: weighted random algorithm"
+	depends on IP_ROUTE_MULTIPATH_CACHED
+	help
+	  Multipath routes are chosen in a weighted random fashion. 
+	  The per route weights are the weights visible via ip route 2. As the
+	  corresponding state management introduces some overhead routing delay
+	  is increased.
+
+config IP_ROUTE_MULTIPATH_DRR
+	tristate "MULTIPATH: interface round robin algorithm"
+	depends on IP_ROUTE_MULTIPATH_CACHED
+	help
+	  Connections are distributed in a round robin fashion over the
+	  available interfaces. This policy makes sense if the connections 
+	  should be primarily distributed on interfaces and not on routes. 
+
+config IP_ROUTE_VERBOSE
+	bool "IP: verbose route monitoring"
+	depends on IP_ADVANCED_ROUTER
+	help
+	  If you say Y here, which is recommended, then the kernel will print
+	  verbose messages regarding the routing, for example warnings about
+	  received packets which look strange and could be evidence of an
+	  attack or a misconfigured system somewhere. The information is
+	  handled by the klogd daemon which is responsible for kernel messages
+	  ("man klogd").
+
+config IP_PNP
+	bool "IP: kernel level autoconfiguration"
+	depends on INET
+	help
+	  This enables automatic configuration of IP addresses of devices and
+	  of the routing table during kernel boot, based on either information
+	  supplied on the kernel command line or by BOOTP or RARP protocols.
+	  You need to say Y only for diskless machines requiring network
+	  access to boot (in which case you want to say Y to "Root file system
+	  on NFS" as well), because all other machines configure the network
+	  in their startup scripts.
+
+config IP_PNP_DHCP
+	bool "IP: DHCP support"
+	depends on IP_PNP
+	---help---
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the DHCP protocol (a
+	  special protocol designed for doing this job), say Y here. In case
+	  the boot ROM of your network card was designed for booting Linux and
+	  does DHCP itself, providing all necessary information on the kernel
+	  command line, you can say N here.
+
+	  If unsure, say Y. Note that if you want to use DHCP, a DHCP server
+	  must be operating on your network.  Read
+	  <file:Documentation/nfsroot.txt> for details.
+
+config IP_PNP_BOOTP
+	bool "IP: BOOTP support"
+	depends on IP_PNP
+	---help---
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the BOOTP protocol (a
+	  special protocol designed for doing this job), say Y here. In case
+	  the boot ROM of your network card was designed for booting Linux and
+	  does BOOTP itself, providing all necessary information on the kernel
+	  command line, you can say N here. If unsure, say Y. Note that if you
+	  want to use BOOTP, a BOOTP server must be operating on your network.
+	  Read <file:Documentation/nfsroot.txt> for details.
+
+config IP_PNP_RARP
+	bool "IP: RARP support"
+	depends on IP_PNP
+	help
+	  If you want your Linux box to mount its whole root file system (the
+	  one containing the directory /) from some other computer over the
+	  net via NFS and you want the IP address of your computer to be
+	  discovered automatically at boot time using the RARP protocol (an
+	  older protocol which is being obsoleted by BOOTP and DHCP), say Y
+	  here. Note that if you want to use RARP, a RARP server must be
+	  operating on your network. Read <file:Documentation/nfsroot.txt> for
+	  details.
+
+# not yet ready..
+#   bool '    IP: ARP support' CONFIG_IP_PNP_ARP		
+config NET_IPIP
+	tristate "IP: tunneling"
+	depends on INET
+	select INET_TUNNEL
+	---help---
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This particular tunneling driver implements
+	  encapsulation of IP within IP, which sounds kind of pointless, but
+	  can be useful if you want to make your (or some other) machine
+	  appear on a different network than it physically is, or to use
+	  mobile-IP facilities (allowing laptops to seamlessly move between
+	  networks without changing their IP addresses).
+
+	  Saying Y to this option will produce two modules ( = code which can
+	  be inserted in and removed from the running kernel whenever you
+	  want). Most people won't need this and can say N.
+
+config NET_IPGRE
+	tristate "IP: GRE tunnels over IP"
+	depends on INET
+	select XFRM
+	help
+	  Tunneling means encapsulating data of one protocol type within
+	  another protocol and sending it over a channel that understands the
+	  encapsulating protocol. This particular tunneling driver implements
+	  GRE (Generic Routing Encapsulation) and at this time allows
+	  encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure.
+	  This driver is useful if the other endpoint is a Cisco router: Cisco
+	  likes GRE much better than the other Linux tunneling driver ("IP
+	  tunneling" above). In addition, GRE allows multicast redistribution
+	  through the tunnel.
+
+config NET_IPGRE_BROADCAST
+	bool "IP: broadcast GRE over IP"
+	depends on IP_MULTICAST && NET_IPGRE
+	help
+	  One application of GRE/IP is to construct a broadcast WAN (Wide Area
+	  Network), which looks like a normal Ethernet LAN (Local Area
+	  Network), but can be distributed all over the Internet. If you want
+	  to do that, say Y here and to "IP multicast routing" below.
+
+config IP_MROUTE
+	bool "IP: multicast routing"
+	depends on IP_MULTICAST
+	help
+	  This is used if you want your machine to act as a router for IP
+	  packets that have several destination addresses. It is needed on the
+	  MBONE, a high bandwidth network on top of the Internet which carries
+	  audio and video broadcasts. In order to do that, you would most
+	  likely run the program mrouted. Information about the multicast
+	  capabilities of the various network cards is contained in
+	  <file:Documentation/networking/multicast.txt>. If you haven't heard
+	  about it, you don't need it.
+
+config IP_PIMSM_V1
+	bool "IP: PIM-SM version 1 support"
+	depends on IP_MROUTE
+	help
+	  Kernel side support for Sparse Mode PIM (Protocol Independent
+	  Multicast) version 1. This multicast routing protocol is used widely
+	  because Cisco supports it. You need special software to use it
+	  (pimd-v1). Please see <http://netweb.usc.edu/pim/> for more
+	  information about PIM.
+
+	  Say Y if you want to use PIM-SM v1. Note that you can say N here if
+	  you just want to use Dense Mode PIM.
+
+config IP_PIMSM_V2
+	bool "IP: PIM-SM version 2 support"
+	depends on IP_MROUTE
+	help
+	  Kernel side support for Sparse Mode PIM version 2. In order to use
+	  this, you need an experimental routing daemon supporting it (pimd or
+	  gated-5). This routing protocol is not used widely, so say N unless
+	  you want to play with it.
+
+config ARPD
+	bool "IP: ARP daemon support (EXPERIMENTAL)"
+	depends on INET && EXPERIMENTAL
+	---help---
+	  Normally, the kernel maintains an internal cache which maps IP
+	  addresses to hardware addresses on the local network, so that
+	  Ethernet/Token Ring/ etc. frames are sent to the proper address on
+	  the physical networking layer. For small networks having a few
+	  hundred directly connected hosts or less, keeping this address
+	  resolution (ARP) cache inside the kernel works well. However,
+	  maintaining an internal ARP cache does not work well for very large
+	  switched networks, and will use a lot of kernel memory if TCP/IP
+	  connections are made to many machines on the network.
+
+	  If you say Y here, the kernel's internal ARP cache will never grow
+	  to more than 256 entries (the oldest entries are expired in a LIFO
+	  manner) and communication will be attempted with the user space ARP
+	  daemon arpd. Arpd then answers the address resolution request either
+	  from its own cache or by asking the net.
+
+	  This code is experimental and also obsolete. If you want to use it,
+	  you need to find a version of the daemon arpd on the net somewhere,
+	  and you should also say Y to "Kernel/User network link driver",
+	  below. If unsure, say N.
+
+config SYN_COOKIES
+	bool "IP: TCP syncookie support (disabled per default)"
+	depends on INET
+	---help---
+	  Normal TCP/IP networking is open to an attack known as "SYN
+	  flooding". This denial-of-service attack prevents legitimate remote
+	  users from being able to connect to your computer during an ongoing
+	  attack and requires very little work from the attacker, who can
+	  operate from anywhere on the Internet.
+
+	  SYN cookies provide protection against this type of attack. If you
+	  say Y here, the TCP/IP stack will use a cryptographic challenge
+	  protocol known as "SYN cookies" to enable legitimate users to
+	  continue to connect, even when your machine is under attack. There
+	  is no need for the legitimate users to change their TCP/IP software;
+	  SYN cookies work transparently to them. For technical information
+	  about SYN cookies, check out <http://cr.yp.to/syncookies.html>.
+
+	  If you are SYN flooded, the source address reported by the kernel is
+	  likely to have been forged by the attacker; it is only reported as
+	  an aid in tracing the packets to their actual source and should not
+	  be taken as absolute truth.
+
+	  SYN cookies may prevent correct error reporting on clients when the
+	  server is really overloaded. If this happens frequently better turn
+	  them off.
+
+	  If you say Y here, note that SYN cookies aren't enabled by default;
+	  you can enable them by saying Y to "/proc file system support" and
+	  "Sysctl support" below and executing the command
+
+	  echo 1 >/proc/sys/net/ipv4/tcp_syncookies
+
+	  at boot time after the /proc file system has been mounted.
+
+	  If unsure, say N.
+
+config INET_AH
+	tristate "IP: AH transformation"
+	depends on INET
+	select XFRM
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_MD5
+	select CRYPTO_SHA1
+	---help---
+	  Support for IPsec AH.
+
+	  If unsure, say Y.
+
+config INET_ESP
+	tristate "IP: ESP transformation"
+	depends on INET
+	select XFRM
+	select CRYPTO
+	select CRYPTO_HMAC
+	select CRYPTO_MD5
+	select CRYPTO_SHA1
+	select CRYPTO_DES
+	---help---
+	  Support for IPsec ESP.
+
+	  If unsure, say Y.
+
+config INET_IPCOMP
+	tristate "IP: IPComp transformation"
+	depends on INET
+	select XFRM
+	select INET_TUNNEL
+	select CRYPTO
+	select CRYPTO_DEFLATE
+	---help---
+	  Support for IP Payload Compression Protocol (IPComp) (RFC3173),
+	  typically needed for IPsec.
+	  
+	  If unsure, say Y.
+
+config INET_TUNNEL
+	tristate "IP: tunnel transformation"
+	depends on INET
+	select XFRM
+	---help---
+	  Support for generic IP tunnel transformation, which is required by
+	  the IP tunneling module as well as tunnel mode IPComp.
+	  
+	  If unsure, say Y.
+
+config IP_TCPDIAG
+	tristate "IP: TCP socket monitoring interface"
+	depends on INET
+	default y
+	---help---
+	  Support for TCP socket monitoring interface used by native Linux
+	  tools such as ss. ss is included in iproute2, currently downloadable
+	  at <http://developer.osdl.org/dev/iproute2>. If you want IPv6 support
+	  and have selected IPv6 as a module, you need to build this as a
+	  module too.
+	  
+	  If unsure, say Y.
+
+config IP_TCPDIAG_IPV6
+	def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6)
+
+source "net/ipv4/ipvs/Kconfig"
+
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
new file mode 100644
index 000000000000..8b379627ebb6
--- /dev/null
+++ b/net/ipv4/Makefile
@@ -0,0 +1,33 @@
+#
+# Makefile for the Linux TCP/IP (INET) layer.
+#
+
+obj-y     := utils.o route.o inetpeer.o protocol.o \
+	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
+	     ip_output.o ip_sockglue.o \
+	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \
+	     datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \
+	     sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o
+
+obj-$(CONFIG_PROC_FS) += proc.o
+obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
+obj-$(CONFIG_IP_MROUTE) += ipmr.o
+obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE) += ip_gre.o
+obj-$(CONFIG_SYN_COOKIES) += syncookies.o
+obj-$(CONFIG_INET_AH) += ah4.o
+obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
+obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o 
+obj-$(CONFIG_IP_PNP) += ipconfig.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o
+obj-$(CONFIG_NETFILTER)	+= netfilter/
+obj-$(CONFIG_IP_VS) += ipvs/
+obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o 
+obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o
+
+obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
+		      xfrm4_output.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
new file mode 100644
index 000000000000..c34dab67e461
--- /dev/null
+++ b/net/ipv4/af_inet.c
@@ -0,0 +1,1188 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		PF_INET protocol family socket handler.
+ *
+ * Version:	$Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Alan Cox, <A.Cox@swansea.ac.uk>
+ *
+ * Changes (see also sock.c)
+ *
+ *		piggy,
+ *		Karl Knutson	:	Socket protocol table
+ *		A.N.Kuznetsov	:	Socket death error in accept().
+ *		John Richardson :	Fix non blocking error in connect()
+ *					so sockets that fail to connect
+ *					don't return -EINPROGRESS.
+ *		Alan Cox	:	Asynchronous I/O support
+ *		Alan Cox	:	Keep correct socket pointer on sock
+ *					structures
+ *					when accept() ed
+ *		Alan Cox	:	Semantics of SO_LINGER aren't state
+ *					moved to close when you look carefully.
+ *					With this fixed and the accept bug fixed
+ *					some RPC stuff seems happier.
+ *		Niibe Yutaka	:	4.4BSD style write async I/O
+ *		Alan Cox,
+ *		Tony Gale 	:	Fixed reuse semantics.
+ *		Alan Cox	:	bind() shouldn't abort existing but dead
+ *					sockets. Stops FTP netin:.. I hope.
+ *		Alan Cox	:	bind() works correctly for RAW sockets.
+ *					Note that FreeBSD at least was broken
+ *					in this respect so be careful with
+ *					compatibility tests...
+ *		Alan Cox	:	routing cache support
+ *		Alan Cox	:	memzero the socket structure for
+ *					compactness.
+ *		Matt Day	:	nonblock connect error handler
+ *		Alan Cox	:	Allow large numbers of pending sockets
+ *					(eg for big web sites), but only if
+ *					specifically application requested.
+ *		Alan Cox	:	New buffering throughout IP. Used
+ *					dumbly.
+ *		Alan Cox	:	New buffering now used smartly.
+ *		Alan Cox	:	BSD rather than common sense
+ *					interpretation of listen.
+ *		Germano Caronni	:	Assorted small races.
+ *		Alan Cox	:	sendmsg/recvmsg basic support.
+ *		Alan Cox	:	Only sendmsg/recvmsg now supported.
+ *		Alan Cox	:	Locked down bind (see security list).
+ *		Alan Cox	:	Loosened bind a little.
+ *		Mike McLagan	:	ADD/DEL DLCI Ioctls
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		David S. Miller	:	New socket lookup architecture.
+ *					Some other random speedups.
+ *		Cyrus Durgin	:	Cleaned up file for kmod hacks.
+ *		Andi Kleen	:	Fix inet_stream_connect TCP race.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/string.h>
+#include <linux/sockios.h>
+#include <linux/net.h>
+#include <linux/fcntl.h>
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+#include <linux/smp_lock.h>
+#include <linux/inet.h>
+#include <linux/igmp.h>
+#include <linux/netdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/arp.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/raw.h>
+#include <net/icmp.h>
+#include <net/ipip.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+
+DEFINE_SNMP_STAT(struct linux_mib, net_statistics);
+
+#ifdef INET_REFCNT_DEBUG
+atomic_t inet_sock_nr;
+#endif
+
+extern void ip_mc_drop_socket(struct sock *sk);
+
+/* The inetsw table contains everything that inet_create needs to
+ * build a new socket.
+ */
+static struct list_head inetsw[SOCK_MAX];
+static DEFINE_SPINLOCK(inetsw_lock);
+
+/* New destruction routine */
+
+void inet_sock_destruct(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	__skb_queue_purge(&sk->sk_receive_queue);
+	__skb_queue_purge(&sk->sk_error_queue);
+
+	if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
+		printk("Attempt to release TCP socket in state %d %p\n",
+		       sk->sk_state, sk);
+		return;
+	}
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		printk("Attempt to release alive inet socket %p\n", sk);
+		return;
+	}
+
+	BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
+	BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+	BUG_TRAP(!sk->sk_wmem_queued);
+	BUG_TRAP(!sk->sk_forward_alloc);
+
+	if (inet->opt)
+		kfree(inet->opt);
+	dst_release(sk->sk_dst_cache);
+#ifdef INET_REFCNT_DEBUG
+	atomic_dec(&inet_sock_nr);
+	printk(KERN_DEBUG "INET socket %p released, %d are still alive\n",
+	       sk, atomic_read(&inet_sock_nr));
+#endif
+}
+
+/*
+ *	The routines beyond this point handle the behaviour of an AF_INET
+ *	socket object. Mostly it punts to the subprotocols of IP to do
+ *	the work.
+ */
+
+/*
+ *	Automatically bind an unbound socket.
+ */
+
+static int inet_autobind(struct sock *sk)
+{
+	struct inet_sock *inet;
+	/* We may need to bind the socket. */
+	lock_sock(sk);
+	inet = inet_sk(sk);
+	if (!inet->num) {
+		if (sk->sk_prot->get_port(sk, 0)) {
+			release_sock(sk);
+			return -EAGAIN;
+		}
+		inet->sport = htons(inet->num);
+	}
+	release_sock(sk);
+	return 0;
+}
+
+/*
+ *	Move a socket into listening state.
+ */
+int inet_listen(struct socket *sock, int backlog)
+{
+	struct sock *sk = sock->sk;
+	unsigned char old_state;
+	int err;
+
+	lock_sock(sk);
+
+	err = -EINVAL;
+	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
+		goto out;
+
+	old_state = sk->sk_state;
+	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+		goto out;
+
+	/* Really, if the socket is already in listen state
+	 * we can only allow the backlog to be adjusted.
+	 */
+	if (old_state != TCP_LISTEN) {
+		err = tcp_listen_start(sk);
+		if (err)
+			goto out;
+	}
+	sk->sk_max_ack_backlog = backlog;
+	err = 0;
+
+out:
+	release_sock(sk);
+	return err;
+}
+
+/*
+ *	Create an inet socket.
+ */
+
+static int inet_create(struct socket *sock, int protocol)
+{
+	struct sock *sk;
+	struct list_head *p;
+	struct inet_protosw *answer;
+	struct inet_sock *inet;
+	struct proto *answer_prot;
+	unsigned char answer_flags;
+	char answer_no_check;
+	int err;
+
+	sock->state = SS_UNCONNECTED;
+
+	/* Look for the requested type/protocol pair. */
+	answer = NULL;
+	rcu_read_lock();
+	list_for_each_rcu(p, &inetsw[sock->type]) {
+		answer = list_entry(p, struct inet_protosw, list);
+
+		/* Check the non-wild match. */
+		if (protocol == answer->protocol) {
+			if (protocol != IPPROTO_IP)
+				break;
+		} else {
+			/* Check for the two wild cases. */
+			if (IPPROTO_IP == protocol) {
+				protocol = answer->protocol;
+				break;
+			}
+			if (IPPROTO_IP == answer->protocol)
+				break;
+		}
+		answer = NULL;
+	}
+
+	err = -ESOCKTNOSUPPORT;
+	if (!answer)
+		goto out_rcu_unlock;
+	err = -EPERM;
+	if (answer->capability > 0 && !capable(answer->capability))
+		goto out_rcu_unlock;
+	err = -EPROTONOSUPPORT;
+	if (!protocol)
+		goto out_rcu_unlock;
+
+	sock->ops = answer->ops;
+	answer_prot = answer->prot;
+	answer_no_check = answer->no_check;
+	answer_flags = answer->flags;
+	rcu_read_unlock();
+
+	BUG_TRAP(answer_prot->slab != NULL);
+
+	err = -ENOBUFS;
+	sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1);
+	if (sk == NULL)
+		goto out;
+
+	err = 0;
+	sk->sk_no_check = answer_no_check;
+	if (INET_PROTOSW_REUSE & answer_flags)
+		sk->sk_reuse = 1;
+
+	inet = inet_sk(sk);
+
+	if (SOCK_RAW == sock->type) {
+		inet->num = protocol;
+		if (IPPROTO_RAW == protocol)
+			inet->hdrincl = 1;
+	}
+
+	if (ipv4_config.no_pmtu_disc)
+		inet->pmtudisc = IP_PMTUDISC_DONT;
+	else
+		inet->pmtudisc = IP_PMTUDISC_WANT;
+
+	inet->id = 0;
+
+	sock_init_data(sock, sk);
+
+	sk->sk_destruct	   = inet_sock_destruct;
+	sk->sk_family	   = PF_INET;
+	sk->sk_protocol	   = protocol;
+	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
+
+	inet->uc_ttl	= -1;
+	inet->mc_loop	= 1;
+	inet->mc_ttl	= 1;
+	inet->mc_index	= 0;
+	inet->mc_list	= NULL;
+
+#ifdef INET_REFCNT_DEBUG
+	atomic_inc(&inet_sock_nr);
+#endif
+
+	if (inet->num) {
+		/* It assumes that any protocol which allows
+		 * the user to assign a number at socket
+		 * creation time automatically
+		 * shares.
+		 */
+		inet->sport = htons(inet->num);
+		/* Add to protocol hash chains. */
+		sk->sk_prot->hash(sk);
+	}
+
+	if (sk->sk_prot->init) {
+		err = sk->sk_prot->init(sk);
+		if (err)
+			sk_common_release(sk);
+	}
+out:
+	return err;
+out_rcu_unlock:
+	rcu_read_unlock();
+	goto out;
+}
+
+
+/*
+ *	The peer socket should always be NULL (or else). When we call this
+ *	function we are destroying the object and from then on nobody
+ *	should refer to it.
+ */
+int inet_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk) {
+		long timeout;
+
+		/* Applications forget to leave groups before exiting */
+		ip_mc_drop_socket(sk);
+
+		/* If linger is set, we don't return until the close
+		 * is complete.  Otherwise we return immediately. The
+		 * actually closing is done the same either way.
+		 *
+		 * If the close is due to the process exiting, we never
+		 * linger..
+		 */
+		timeout = 0;
+		if (sock_flag(sk, SOCK_LINGER) &&
+		    !(current->flags & PF_EXITING))
+			timeout = sk->sk_lingertime;
+		sock->sk = NULL;
+		sk->sk_prot->close(sk, timeout);
+	}
+	return 0;
+}
+
+/* It is off by default, see below. */
+int sysctl_ip_nonlocal_bind;
+
+int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
+{
+	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
+	struct sock *sk = sock->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	unsigned short snum;
+	int chk_addr_ret;
+	int err;
+
+	/* If the socket has its own bind function then use it. (RAW) */
+	if (sk->sk_prot->bind) {
+		err = sk->sk_prot->bind(sk, uaddr, addr_len);
+		goto out;
+	}
+	err = -EINVAL;
+	if (addr_len < sizeof(struct sockaddr_in))
+		goto out;
+
+	chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+
+	/* Not specified by any standard per-se, however it breaks too
+	 * many applications when removed.  It is unfortunate since
+	 * allowing applications to make a non-local bind solves
+	 * several problems with systems using dynamic addressing.
+	 * (ie. your servers still start up even if your ISDN link
+	 *  is temporarily down)
+	 */
+	err = -EADDRNOTAVAIL;
+	if (!sysctl_ip_nonlocal_bind &&
+	    !inet->freebind &&
+	    addr->sin_addr.s_addr != INADDR_ANY &&
+	    chk_addr_ret != RTN_LOCAL &&
+	    chk_addr_ret != RTN_MULTICAST &&
+	    chk_addr_ret != RTN_BROADCAST)
+		goto out;
+
+	snum = ntohs(addr->sin_port);
+	err = -EACCES;
+	if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+		goto out;
+
+	/*      We keep a pair of addresses. rcv_saddr is the one
+	 *      used by hash lookups, and saddr is used for transmit.
+	 *
+	 *      In the BSD API these are the same except where it
+	 *      would be illegal to use them (multicast/broadcast) in
+	 *      which case the sending device address is used.
+	 */
+	lock_sock(sk);
+
+	/* Check these errors (active socket, double bind). */
+	err = -EINVAL;
+	if (sk->sk_state != TCP_CLOSE || inet->num)
+		goto out_release_sock;
+
+	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+		inet->saddr = 0;  /* Use device */
+
+	/* Make sure we are allowed to bind here. */
+	if (sk->sk_prot->get_port(sk, snum)) {
+		inet->saddr = inet->rcv_saddr = 0;
+		err = -EADDRINUSE;
+		goto out_release_sock;
+	}
+
+	if (inet->rcv_saddr)
+		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
+	if (snum)
+		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
+	inet->sport = htons(inet->num);
+	inet->daddr = 0;
+	inet->dport = 0;
+	sk_dst_reset(sk);
+	err = 0;
+out_release_sock:
+	release_sock(sk);
+out:
+	return err;
+}
+
+int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
+		       int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+
+	if (uaddr->sa_family == AF_UNSPEC)
+		return sk->sk_prot->disconnect(sk, flags);
+
+	if (!inet_sk(sk)->num && inet_autobind(sk))
+		return -EAGAIN;
+	return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
+}
+
+static long inet_wait_for_connect(struct sock *sk, long timeo)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+
+	/* Basic assumption: if someone sets sk->sk_err, he _must_
+	 * change state of the socket from TCP_SYN_*.
+	 * Connect() does not allow to get error notifications
+	 * without closing the socket.
+	 */
+	while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		release_sock(sk);
+		timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		if (signal_pending(current) || !timeo)
+			break;
+		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+	}
+	finish_wait(sk->sk_sleep, &wait);
+	return timeo;
+}
+
+/*
+ *	Connect to a remote host. There is regrettably still a little
+ *	TCP 'magic' in here.
+ */
+int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
+			int addr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	int err;
+	long timeo;
+
+	lock_sock(sk);
+
+	if (uaddr->sa_family == AF_UNSPEC) {
+		err = sk->sk_prot->disconnect(sk, flags);
+		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+		goto out;
+	}
+
+	switch (sock->state) {
+	default:
+		err = -EINVAL;
+		goto out;
+	case SS_CONNECTED:
+		err = -EISCONN;
+		goto out;
+	case SS_CONNECTING:
+		err = -EALREADY;
+		/* Fall out of switch with err, set for this state */
+		break;
+	case SS_UNCONNECTED:
+		err = -EISCONN;
+		if (sk->sk_state != TCP_CLOSE)
+			goto out;
+
+		err = sk->sk_prot->connect(sk, uaddr, addr_len);
+		if (err < 0)
+			goto out;
+
+  		sock->state = SS_CONNECTING;
+
+		/* Just entered SS_CONNECTING state; the only
+		 * difference is that return value in non-blocking
+		 * case is EINPROGRESS, rather than EALREADY.
+		 */
+		err = -EINPROGRESS;
+		break;
+	}
+
+	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		/* Error code is set above */
+		if (!timeo || !inet_wait_for_connect(sk, timeo))
+			goto out;
+
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			goto out;
+	}
+
+	/* Connection was closed by RST, timeout, ICMP error
+	 * or another process disconnected us.
+	 */
+	if (sk->sk_state == TCP_CLOSE)
+		goto sock_error;
+
+	/* sk->sk_err may be not zero now, if RECVERR was ordered by user
+	 * and error was received after socket entered established state.
+	 * Hence, it is handled normally after connect() return successfully.
+	 */
+
+	sock->state = SS_CONNECTED;
+	err = 0;
+out:
+	release_sock(sk);
+	return err;
+
+sock_error:
+	err = sock_error(sk) ? : -ECONNABORTED;
+	sock->state = SS_UNCONNECTED;
+	if (sk->sk_prot->disconnect(sk, flags))
+		sock->state = SS_DISCONNECTING;
+	goto out;
+}
+
+/*
+ *	Accept a pending connection. The TCP layer now gives BSD semantics.
+ */
+
+int inet_accept(struct socket *sock, struct socket *newsock, int flags)
+{
+	struct sock *sk1 = sock->sk;
+	int err = -EINVAL;
+	struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
+
+	if (!sk2)
+		goto do_err;
+
+	lock_sock(sk2);
+
+	BUG_TRAP((1 << sk2->sk_state) &
+		 (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
+
+	sock_graft(sk2, newsock);
+
+	newsock->state = SS_CONNECTED;
+	err = 0;
+	release_sock(sk2);
+do_err:
+	return err;
+}
+
+
+/*
+ *	This does both peername and sockname.
+ */
+int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+			int *uaddr_len, int peer)
+{
+	struct sock *sk		= sock->sk;
+	struct inet_sock *inet	= inet_sk(sk);
+	struct sockaddr_in *sin	= (struct sockaddr_in *)uaddr;
+
+	sin->sin_family = AF_INET;
+	if (peer) {
+		if (!inet->dport ||
+		    (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
+		     peer == 1))
+			return -ENOTCONN;
+		sin->sin_port = inet->dport;
+		sin->sin_addr.s_addr = inet->daddr;
+	} else {
+		__u32 addr = inet->rcv_saddr;
+		if (!addr)
+			addr = inet->saddr;
+		sin->sin_port = inet->sport;
+		sin->sin_addr.s_addr = addr;
+	}
+	memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+	*uaddr_len = sizeof(*sin);
+	return 0;
+}
+
+int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+		 size_t size)
+{
+	struct sock *sk = sock->sk;
+
+	/* We may need to bind the socket. */
+	if (!inet_sk(sk)->num && inet_autobind(sk))
+		return -EAGAIN;
+
+	return sk->sk_prot->sendmsg(iocb, sk, msg, size);
+}
+
+
+static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+{
+	struct sock *sk = sock->sk;
+
+	/* We may need to bind the socket. */
+	if (!inet_sk(sk)->num && inet_autobind(sk))
+		return -EAGAIN;
+
+	if (sk->sk_prot->sendpage)
+		return sk->sk_prot->sendpage(sk, page, offset, size, flags);
+	return sock_no_sendpage(sock, page, offset, size, flags);
+}
+
+
+int inet_shutdown(struct socket *sock, int how)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	/* This should really check to make sure
+	 * the socket is a TCP socket. (WHY AC...)
+	 */
+	how++; /* maps 0->1 has the advantage of making bit 1 rcvs and
+		       1->2 bit 2 snds.
+		       2->3 */
+	if ((how & ~SHUTDOWN_MASK) || !how)	/* MAXINT->0 */
+		return -EINVAL;
+
+	lock_sock(sk);
+	if (sock->state == SS_CONNECTING) {
+		if ((1 << sk->sk_state) &
+		    (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE))
+			sock->state = SS_DISCONNECTING;
+		else
+			sock->state = SS_CONNECTED;
+	}
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		err = -ENOTCONN;
+		/* Hack to wake up other listeners, who can poll for
+		   POLLHUP, even on eg. unconnected UDP sockets -- RR */
+	default:
+		sk->sk_shutdown |= how;
+		if (sk->sk_prot->shutdown)
+			sk->sk_prot->shutdown(sk, how);
+		break;
+
+	/* Remaining two branches are temporary solution for missing
+	 * close() in multithreaded environment. It is _not_ a good idea,
+	 * but we have no choice until close() is repaired at VFS level.
+	 */
+	case TCP_LISTEN:
+		if (!(how & RCV_SHUTDOWN))
+			break;
+		/* Fall through */
+	case TCP_SYN_SENT:
+		err = sk->sk_prot->disconnect(sk, O_NONBLOCK);
+		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+		break;
+	}
+
+	/* Wake up anyone sleeping in poll. */
+	sk->sk_state_change(sk);
+	release_sock(sk);
+	return err;
+}
+
+/*
+ *	ioctl() calls you can issue on an INET socket. Most of these are
+ *	device configuration and stuff and very rarely used. Some ioctls
+ *	pass on to the socket itself.
+ *
+ *	NOTE: I like the idea of a module for the config stuff. ie ifconfig
+ *	loads the devconfigure module does its configuring and unloads it.
+ *	There's a good 20K of config code hanging around the kernel.
+ */
+
+int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+	struct sock *sk = sock->sk;
+	int err = 0;
+
+	switch (cmd) {
+		case SIOCGSTAMP:
+			err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+			break;
+		case SIOCADDRT:
+		case SIOCDELRT:
+		case SIOCRTMSG:
+			err = ip_rt_ioctl(cmd, (void __user *)arg);
+			break;
+		case SIOCDARP:
+		case SIOCGARP:
+		case SIOCSARP:
+			err = arp_ioctl(cmd, (void __user *)arg);
+			break;
+		case SIOCGIFADDR:
+		case SIOCSIFADDR:
+		case SIOCGIFBRDADDR:
+		case SIOCSIFBRDADDR:
+		case SIOCGIFNETMASK:
+		case SIOCSIFNETMASK:
+		case SIOCGIFDSTADDR:
+		case SIOCSIFDSTADDR:
+		case SIOCSIFPFLAGS:
+		case SIOCGIFPFLAGS:
+		case SIOCSIFFLAGS:
+			err = devinet_ioctl(cmd, (void __user *)arg);
+			break;
+		default:
+			if (!sk->sk_prot->ioctl ||
+			    (err = sk->sk_prot->ioctl(sk, cmd, arg)) ==
+			    					-ENOIOCTLCMD)
+				err = dev_ioctl(cmd, (void __user *)arg);
+			break;
+	}
+	return err;
+}
+
+struct proto_ops inet_stream_ops = {
+	.family =	PF_INET,
+	.owner =	THIS_MODULE,
+	.release =	inet_release,
+	.bind =		inet_bind,
+	.connect =	inet_stream_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	inet_accept,
+	.getname =	inet_getname,
+	.poll =		tcp_poll,
+	.ioctl =	inet_ioctl,
+	.listen =	inet_listen,
+	.shutdown =	inet_shutdown,
+	.setsockopt =	sock_common_setsockopt,
+	.getsockopt =	sock_common_getsockopt,
+	.sendmsg =	inet_sendmsg,
+	.recvmsg =	sock_common_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	tcp_sendpage
+};
+
+struct proto_ops inet_dgram_ops = {
+	.family =	PF_INET,
+	.owner =	THIS_MODULE,
+	.release =	inet_release,
+	.bind =		inet_bind,
+	.connect =	inet_dgram_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	inet_getname,
+	.poll =		udp_poll,
+	.ioctl =	inet_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	inet_shutdown,
+	.setsockopt =	sock_common_setsockopt,
+	.getsockopt =	sock_common_getsockopt,
+	.sendmsg =	inet_sendmsg,
+	.recvmsg =	sock_common_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	inet_sendpage,
+};
+
+/*
+ * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
+ * udp_poll
+ */
+static struct proto_ops inet_sockraw_ops = {
+	.family =	PF_INET,
+	.owner =	THIS_MODULE,
+	.release =	inet_release,
+	.bind =		inet_bind,
+	.connect =	inet_dgram_connect,
+	.socketpair =	sock_no_socketpair,
+	.accept =	sock_no_accept,
+	.getname =	inet_getname,
+	.poll =		datagram_poll,
+	.ioctl =	inet_ioctl,
+	.listen =	sock_no_listen,
+	.shutdown =	inet_shutdown,
+	.setsockopt =	sock_common_setsockopt,
+	.getsockopt =	sock_common_getsockopt,
+	.sendmsg =	inet_sendmsg,
+	.recvmsg =	sock_common_recvmsg,
+	.mmap =		sock_no_mmap,
+	.sendpage =	inet_sendpage,
+};
+
+static struct net_proto_family inet_family_ops = {
+	.family = PF_INET,
+	.create = inet_create,
+	.owner	= THIS_MODULE,
+};
+
+
+extern void tcp_init(void);
+extern void tcp_v4_init(struct net_proto_family *);
+
+/* Upon startup we insert all the elements in inetsw_array[] into
+ * the linked list inetsw.
+ */
+static struct inet_protosw inetsw_array[] =
+{
+        {
+                .type =       SOCK_STREAM,
+                .protocol =   IPPROTO_TCP,
+                .prot =       &tcp_prot,
+                .ops =        &inet_stream_ops,
+                .capability = -1,
+                .no_check =   0,
+                .flags =      INET_PROTOSW_PERMANENT,
+        },
+
+        {
+                .type =       SOCK_DGRAM,
+                .protocol =   IPPROTO_UDP,
+                .prot =       &udp_prot,
+                .ops =        &inet_dgram_ops,
+                .capability = -1,
+                .no_check =   UDP_CSUM_DEFAULT,
+                .flags =      INET_PROTOSW_PERMANENT,
+       },
+        
+
+       {
+               .type =       SOCK_RAW,
+               .protocol =   IPPROTO_IP,	/* wild card */
+               .prot =       &raw_prot,
+               .ops =        &inet_sockraw_ops,
+               .capability = CAP_NET_RAW,
+               .no_check =   UDP_CSUM_DEFAULT,
+               .flags =      INET_PROTOSW_REUSE,
+       }
+};
+
+#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw))
+
+void inet_register_protosw(struct inet_protosw *p)
+{
+	struct list_head *lh;
+	struct inet_protosw *answer;
+	int protocol = p->protocol;
+	struct list_head *last_perm;
+
+	spin_lock_bh(&inetsw_lock);
+
+	if (p->type >= SOCK_MAX)
+		goto out_illegal;
+
+	/* If we are trying to override a permanent protocol, bail. */
+	answer = NULL;
+	last_perm = &inetsw[p->type];
+	list_for_each(lh, &inetsw[p->type]) {
+		answer = list_entry(lh, struct inet_protosw, list);
+
+		/* Check only the non-wild match. */
+		if (INET_PROTOSW_PERMANENT & answer->flags) {
+			if (protocol == answer->protocol)
+				break;
+			last_perm = lh;
+		}
+
+		answer = NULL;
+	}
+	if (answer)
+		goto out_permanent;
+
+	/* Add the new entry after the last permanent entry if any, so that
+	 * the new entry does not override a permanent entry when matched with
+	 * a wild-card protocol. But it is allowed to override any existing
+	 * non-permanent entry.  This means that when we remove this entry, the 
+	 * system automatically returns to the old behavior.
+	 */
+	list_add_rcu(&p->list, last_perm);
+out:
+	spin_unlock_bh(&inetsw_lock);
+
+	synchronize_net();
+
+	return;
+
+out_permanent:
+	printk(KERN_ERR "Attempt to override permanent protocol %d.\n",
+	       protocol);
+	goto out;
+
+out_illegal:
+	printk(KERN_ERR
+	       "Ignoring attempt to register invalid socket type %d.\n",
+	       p->type);
+	goto out;
+}
+
+void inet_unregister_protosw(struct inet_protosw *p)
+{
+	if (INET_PROTOSW_PERMANENT & p->flags) {
+		printk(KERN_ERR
+		       "Attempt to unregister permanent protocol %d.\n",
+		       p->protocol);
+	} else {
+		spin_lock_bh(&inetsw_lock);
+		list_del_rcu(&p->list);
+		spin_unlock_bh(&inetsw_lock);
+
+		synchronize_net();
+	}
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static struct net_protocol igmp_protocol = {
+	.handler =	igmp_rcv,
+};
+#endif
+
+static struct net_protocol tcp_protocol = {
+	.handler =	tcp_v4_rcv,
+	.err_handler =	tcp_v4_err,
+	.no_policy =	1,
+};
+
+static struct net_protocol udp_protocol = {
+	.handler =	udp_rcv,
+	.err_handler =	udp_err,
+	.no_policy =	1,
+};
+
+static struct net_protocol icmp_protocol = {
+	.handler =	icmp_rcv,
+};
+
+static int __init init_ipv4_mibs(void)
+{
+	net_statistics[0] = alloc_percpu(struct linux_mib);
+	net_statistics[1] = alloc_percpu(struct linux_mib);
+	ip_statistics[0] = alloc_percpu(struct ipstats_mib);
+	ip_statistics[1] = alloc_percpu(struct ipstats_mib);
+	icmp_statistics[0] = alloc_percpu(struct icmp_mib);
+	icmp_statistics[1] = alloc_percpu(struct icmp_mib);
+	tcp_statistics[0] = alloc_percpu(struct tcp_mib);
+	tcp_statistics[1] = alloc_percpu(struct tcp_mib);
+	udp_statistics[0] = alloc_percpu(struct udp_mib);
+	udp_statistics[1] = alloc_percpu(struct udp_mib);
+	if (!
+	    (net_statistics[0] && net_statistics[1] && ip_statistics[0]
+	     && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1]
+	     && udp_statistics[0] && udp_statistics[1]))
+		return -ENOMEM;
+
+	(void) tcp_mib_init();
+
+	return 0;
+}
+
+static int ipv4_proc_init(void);
+extern void ipfrag_init(void);
+
+static int __init inet_init(void)
+{
+	struct sk_buff *dummy_skb;
+	struct inet_protosw *q;
+	struct list_head *r;
+	int rc = -EINVAL;
+
+	if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) {
+		printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
+		goto out;
+	}
+
+	rc = proto_register(&tcp_prot, 1);
+	if (rc)
+		goto out;
+
+	rc = proto_register(&udp_prot, 1);
+	if (rc)
+		goto out_unregister_tcp_proto;
+
+	rc = proto_register(&raw_prot, 1);
+	if (rc)
+		goto out_unregister_udp_proto;
+
+	/*
+	 *	Tell SOCKET that we are alive... 
+	 */
+
+  	(void)sock_register(&inet_family_ops);
+
+	/*
+	 *	Add all the base protocols.
+	 */
+
+	if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)
+		printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n");
+	if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)
+		printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n");
+	if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)
+		printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n");
+#ifdef CONFIG_IP_MULTICAST
+	if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)
+		printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n");
+#endif
+
+	/* Register the socket-side information for inet_create. */
+	for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
+		INIT_LIST_HEAD(r);
+
+	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
+		inet_register_protosw(q);
+
+	/*
+	 *	Set the ARP module up
+	 */
+
+	arp_init();
+
+  	/*
+  	 *	Set the IP module up
+  	 */
+
+	ip_init();
+
+	tcp_v4_init(&inet_family_ops);
+
+	/* Setup TCP slab cache for open requests. */
+	tcp_init();
+
+
+	/*
+	 *	Set the ICMP layer up
+	 */
+
+	icmp_init(&inet_family_ops);
+
+	/*
+	 *	Initialise the multicast router
+	 */
+#if defined(CONFIG_IP_MROUTE)
+	ip_mr_init();
+#endif
+	/*
+	 *	Initialise per-cpu ipv4 mibs
+	 */ 
+
+	if(init_ipv4_mibs())
+		printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ;
+	
+	ipv4_proc_init();
+
+	ipfrag_init();
+
+	rc = 0;
+out:
+	return rc;
+out_unregister_tcp_proto:
+	proto_unregister(&tcp_prot);
+out_unregister_udp_proto:
+	proto_unregister(&udp_prot);
+	goto out;
+}
+
+module_init(inet_init);
+
+/* ------------------------------------------------------------------------ */
+
+#ifdef CONFIG_PROC_FS
+extern int  fib_proc_init(void);
+extern void fib_proc_exit(void);
+extern int  ip_misc_proc_init(void);
+extern int  raw_proc_init(void);
+extern void raw_proc_exit(void);
+extern int  tcp4_proc_init(void);
+extern void tcp4_proc_exit(void);
+extern int  udp4_proc_init(void);
+extern void udp4_proc_exit(void);
+
+static int __init ipv4_proc_init(void)
+{
+	int rc = 0;
+
+	if (raw_proc_init())
+		goto out_raw;
+	if (tcp4_proc_init())
+		goto out_tcp;
+	if (udp4_proc_init())
+		goto out_udp;
+	if (fib_proc_init())
+		goto out_fib;
+	if (ip_misc_proc_init())
+		goto out_misc;
+out:
+	return rc;
+out_misc:
+	fib_proc_exit();
+out_fib:
+	udp4_proc_exit();
+out_udp:
+	tcp4_proc_exit();
+out_tcp:
+	raw_proc_exit();
+out_raw:
+	rc = -ENOMEM;
+	goto out;
+}
+
+#else /* CONFIG_PROC_FS */
+static int __init ipv4_proc_init(void)
+{
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
+MODULE_ALIAS_NETPROTO(PF_INET);
+
+EXPORT_SYMBOL(inet_accept);
+EXPORT_SYMBOL(inet_bind);
+EXPORT_SYMBOL(inet_dgram_connect);
+EXPORT_SYMBOL(inet_dgram_ops);
+EXPORT_SYMBOL(inet_getname);
+EXPORT_SYMBOL(inet_ioctl);
+EXPORT_SYMBOL(inet_listen);
+EXPORT_SYMBOL(inet_register_protosw);
+EXPORT_SYMBOL(inet_release);
+EXPORT_SYMBOL(inet_sendmsg);
+EXPORT_SYMBOL(inet_shutdown);
+EXPORT_SYMBOL(inet_sock_destruct);
+EXPORT_SYMBOL(inet_stream_connect);
+EXPORT_SYMBOL(inet_stream_ops);
+EXPORT_SYMBOL(inet_unregister_protosw);
+EXPORT_SYMBOL(net_statistics);
+
+#ifdef INET_REFCNT_DEBUG
+EXPORT_SYMBOL(inet_sock_nr);
+#endif
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
new file mode 100644
index 000000000000..0e98f2235b6e
--- /dev/null
+++ b/net/ipv4/ah4.c
@@ -0,0 +1,335 @@
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/ah.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <net/icmp.h>
+#include <asm/scatterlist.h>
+
+
+/* Clear mutable options and find final destination to substitute
+ * into IP header for icv calculation. Options are already checked
+ * for validity, so paranoia is not required. */
+
+static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr)
+{
+	unsigned char * optptr = (unsigned char*)(iph+1);
+	int  l = iph->ihl*4 - sizeof(struct iphdr);
+	int  optlen;
+
+	while (l > 0) {
+		switch (*optptr) {
+		case IPOPT_END:
+			return 0;
+		case IPOPT_NOOP:
+			l--;
+			optptr++;
+			continue;
+		}
+		optlen = optptr[1];
+		if (optlen<2 || optlen>l)
+			return -EINVAL;
+		switch (*optptr) {
+		case IPOPT_SEC:
+		case 0x85:	/* Some "Extended Security" crap. */
+		case 0x86:	/* Another "Commercial Security" crap. */
+		case IPOPT_RA:
+		case 0x80|21:	/* RFC1770 */
+			break;
+		case IPOPT_LSRR:
+		case IPOPT_SSRR:
+			if (optlen < 6)
+				return -EINVAL;
+			memcpy(daddr, optptr+optlen-4, 4);
+			/* Fall through */
+		default:
+			memset(optptr+2, 0, optlen-2);
+		}
+		l -= optlen;
+		optptr += optlen;
+	}
+	return 0;
+}
+
+static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	struct iphdr *iph, *top_iph;
+	struct ip_auth_hdr *ah;
+	struct ah_data *ahp;
+	union {
+		struct iphdr	iph;
+		char 		buf[60];
+	} tmp_iph;
+
+	top_iph = skb->nh.iph;
+	iph = &tmp_iph.iph;
+
+	iph->tos = top_iph->tos;
+	iph->ttl = top_iph->ttl;
+	iph->frag_off = top_iph->frag_off;
+
+	if (top_iph->ihl != 5) {
+		iph->daddr = top_iph->daddr;
+		memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+		err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
+		if (err)
+			goto error;
+	}
+
+	ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4);
+	ah->nexthdr = top_iph->protocol;
+
+	top_iph->tos = 0;
+	top_iph->tot_len = htons(skb->len);
+	top_iph->frag_off = 0;
+	top_iph->ttl = 0;
+	top_iph->protocol = IPPROTO_AH;
+	top_iph->check = 0;
+
+	ahp = x->data;
+	ah->hdrlen  = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + 
+				   ahp->icv_trunc_len) >> 2) - 2;
+
+	ah->reserved = 0;
+	ah->spi = x->id.spi;
+	ah->seq_no = htonl(++x->replay.oseq);
+	ahp->icv(ahp, skb, ah->auth_data);
+
+	top_iph->tos = iph->tos;
+	top_iph->ttl = iph->ttl;
+	top_iph->frag_off = iph->frag_off;
+	if (top_iph->ihl != 5) {
+		top_iph->daddr = iph->daddr;
+		memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+	}
+
+	ip_send_check(top_iph);
+
+	err = 0;
+
+error:
+	return err;
+}
+
+static int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+	int ah_hlen;
+	struct iphdr *iph;
+	struct ip_auth_hdr *ah;
+	struct ah_data *ahp;
+	char work_buf[60];
+
+	if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr)))
+		goto out;
+
+	ah = (struct ip_auth_hdr*)skb->data;
+	ahp = x->data;
+	ah_hlen = (ah->hdrlen + 2) << 2;
+	
+	if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) &&
+	    ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len)) 
+		goto out;
+
+	if (!pskb_may_pull(skb, ah_hlen))
+		goto out;
+
+	/* We are going to _remove_ AH header to keep sockets happy,
+	 * so... Later this can change. */
+	if (skb_cloned(skb) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		goto out;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	ah = (struct ip_auth_hdr*)skb->data;
+	iph = skb->nh.iph;
+
+	memcpy(work_buf, iph, iph->ihl*4);
+
+	iph->ttl = 0;
+	iph->tos = 0;
+	iph->frag_off = 0;
+	iph->check = 0;
+	if (iph->ihl != 5) {
+		u32 dummy;
+		if (ip_clear_mutable_options(iph, &dummy))
+			goto out;
+	}
+        {
+		u8 auth_data[MAX_AH_AUTH_LEN];
+		
+		memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+		skb_push(skb, skb->data - skb->nh.raw);
+		ahp->icv(ahp, skb, ah->auth_data);
+		if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) {
+			x->stats.integrity_failed++;
+			goto out;
+		}
+	}
+	((struct iphdr*)work_buf)->protocol = ah->nexthdr;
+	skb->nh.raw = skb_pull(skb, ah_hlen);
+	memcpy(skb->nh.raw, work_buf, iph->ihl*4);
+	skb->nh.iph->tot_len = htons(skb->len);
+	skb_pull(skb, skb->nh.iph->ihl*4);
+	skb->h.raw = skb->data;
+
+	return 0;
+
+out:
+	return -EINVAL;
+}
+
+static void ah4_err(struct sk_buff *skb, u32 info)
+{
+	struct iphdr *iph = (struct iphdr*)skb->data;
+	struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2));
+	struct xfrm_state *x;
+
+	if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+	    skb->h.icmph->code != ICMP_FRAG_NEEDED)
+		return;
+
+	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+	if (!x)
+		return;
+	printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
+	       ntohl(ah->spi), ntohl(iph->daddr));
+	xfrm_state_put(x);
+}
+
+static int ah_init_state(struct xfrm_state *x, void *args)
+{
+	struct ah_data *ahp = NULL;
+	struct xfrm_algo_desc *aalg_desc;
+
+	if (!x->aalg)
+		goto error;
+
+	/* null auth can use a zero length key */
+	if (x->aalg->alg_key_len > 512)
+		goto error;
+
+	if (x->encap)
+		goto error;
+
+	ahp = kmalloc(sizeof(*ahp), GFP_KERNEL);
+	if (ahp == NULL)
+		return -ENOMEM;
+
+	memset(ahp, 0, sizeof(*ahp));
+
+	ahp->key = x->aalg->alg_key;
+	ahp->key_len = (x->aalg->alg_key_len+7)/8;
+	ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
+	if (!ahp->tfm)
+		goto error;
+	ahp->icv = ah_hmac_digest;
+	
+	/*
+	 * Lookup the algorithm description maintained by xfrm_algo,
+	 * verify crypto transform properties, and store information
+	 * we need for AH processing.  This lookup cannot fail here
+	 * after a successful crypto_alloc_tfm().
+	 */
+	aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+	BUG_ON(!aalg_desc);
+
+	if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+	    crypto_tfm_alg_digestsize(ahp->tfm)) {
+		printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
+		       x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm),
+		       aalg_desc->uinfo.auth.icv_fullbits/8);
+		goto error;
+	}
+	
+	ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+	ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+	
+	BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
+	
+	ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
+	if (!ahp->work_icv)
+		goto error;
+	
+	x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len);
+	if (x->props.mode)
+		x->props.header_len += sizeof(struct iphdr);
+	x->data = ahp;
+
+	return 0;
+
+error:
+	if (ahp) {
+		if (ahp->work_icv)
+			kfree(ahp->work_icv);
+		if (ahp->tfm)
+			crypto_free_tfm(ahp->tfm);
+		kfree(ahp);
+	}
+	return -EINVAL;
+}
+
+static void ah_destroy(struct xfrm_state *x)
+{
+	struct ah_data *ahp = x->data;
+
+	if (!ahp)
+		return;
+
+	if (ahp->work_icv) {
+		kfree(ahp->work_icv);
+		ahp->work_icv = NULL;
+	}
+	if (ahp->tfm) {
+		crypto_free_tfm(ahp->tfm);
+		ahp->tfm = NULL;
+	}
+	kfree(ahp);
+}
+
+
+static struct xfrm_type ah_type =
+{
+	.description	= "AH4",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_AH,
+	.init_state	= ah_init_state,
+	.destructor	= ah_destroy,
+	.input		= ah_input,
+	.output		= ah_output
+};
+
+static struct net_protocol ah4_protocol = {
+	.handler	=	xfrm4_rcv,
+	.err_handler	=	ah4_err,
+	.no_policy	=	1,
+};
+
+static int __init ah4_init(void)
+{
+	if (xfrm_register_type(&ah_type, AF_INET) < 0) {
+		printk(KERN_INFO "ip ah init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+	if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) {
+		printk(KERN_INFO "ip ah init: can't add protocol\n");
+		xfrm_unregister_type(&ah_type, AF_INET);
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+static void __exit ah4_fini(void)
+{
+	if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0)
+		printk(KERN_INFO "ip ah close: can't remove protocol\n");
+	if (xfrm_unregister_type(&ah_type, AF_INET) < 0)
+		printk(KERN_INFO "ip ah close: can't remove xfrm type\n");
+}
+
+module_init(ah4_init);
+module_exit(ah4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
new file mode 100644
index 000000000000..a642fd612853
--- /dev/null
+++ b/net/ipv4/arp.c
@@ -0,0 +1,1425 @@
+/* linux/net/inet/arp.c
+ *
+ * Version:	$Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $
+ *
+ * Copyright (C) 1994 by Florian  La Roche
+ *
+ * This module implements the Address Resolution Protocol ARP (RFC 826),
+ * which is used to convert IP addresses (or in the future maybe other
+ * high-level addresses) into a low-level hardware address (like an Ethernet
+ * address).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ *		Alan Cox	:	Removed the Ethernet assumptions in 
+ *					Florian's code
+ *		Alan Cox	:	Fixed some small errors in the ARP 
+ *					logic
+ *		Alan Cox	:	Allow >4K in /proc
+ *		Alan Cox	:	Make ARP add its own protocol entry
+ *		Ross Martin     :       Rewrote arp_rcv() and arp_get_info()
+ *		Stephen Henson	:	Add AX25 support to arp_get_info()
+ *		Alan Cox	:	Drop data when a device is downed.
+ *		Alan Cox	:	Use init_timer().
+ *		Alan Cox	:	Double lock fixes.
+ *		Martin Seine	:	Move the arphdr structure
+ *					to if_arp.h for compatibility.
+ *					with BSD based programs.
+ *		Andrew Tridgell :       Added ARP netmask code and
+ *					re-arranged proxy handling.
+ *		Alan Cox	:	Changed to use notifiers.
+ *		Niibe Yutaka	:	Reply for this device or proxies only.
+ *		Alan Cox	:	Don't proxy across hardware types!
+ *		Jonathan Naylor :	Added support for NET/ROM.
+ *		Mike Shaver     :       RFC1122 checks.
+ *		Jonathan Naylor :	Only lookup the hardware address for
+ *					the correct hardware type.
+ *		Germano Caronni	:	Assorted subtle races.
+ *		Craig Schlenter :	Don't modify permanent entry 
+ *					during arp_rcv.
+ *		Russ Nelson	:	Tidied up a few bits.
+ *		Alexey Kuznetsov:	Major changes to caching and behaviour,
+ *					eg intelligent arp probing and 
+ *					generation
+ *					of host down events.
+ *		Alan Cox	:	Missing unlock in device events.
+ *		Eckes		:	ARP ioctl control errors.
+ *		Alexey Kuznetsov:	Arp free fix.
+ *		Manuel Rodriguez:	Gratuitous ARP.
+ *              Jonathan Layes  :       Added arpd support through kerneld 
+ *                                      message queue (960314)
+ *		Mike Shaver	:	/proc/sys/net/ipv4/arp_* support
+ *		Mike McLagan    :	Routing by source
+ *		Stuart Cheshire	:	Metricom and grat arp fixes
+ *					*** FOR 2.1 clean this up ***
+ *		Lawrence V. Stefani: (08/12/96) Added FDDI support.
+ *		Alan Cox 	:	Took the AP1000 nasty FDDI hack and
+ *					folded into the mainstream FDDI code.
+ *					Ack spit, Linus how did you allow that
+ *					one in...
+ *		Jes Sorensen	:	Make FDDI work again in 2.1.x and
+ *					clean up the APFDDI & gen. FDDI bits.
+ *		Alexey Kuznetsov:	new arp state machine;
+ *					now it is in net/core/neighbour.c.
+ *		Krzysztof Halasa:	Added Frame Relay ARP support.
+ *		Arnaldo C. Melo :	convert /proc/net/arp to seq_file
+ *		Shmulik Hen:		Split arp_send to arp_create and
+ *					arp_xmit so intermediate drivers like
+ *					bonding can change the skb before
+ *					sending (e.g. insert 8021q tag).
+ *		Harald Welte	:	convert to make use of jenkins hash
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/fddidevice.h>
+#include <linux/if_arp.h>
+#include <linux/trdevice.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/jhash.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+#include <net/ax25.h>
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+#include <net/netrom.h>
+#endif
+#endif
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+#include <net/atmclip.h>
+struct neigh_table *clip_tbl_hook;
+#endif
+
+#include <asm/system.h>
+#include <asm/uaccess.h>
+
+#include <linux/netfilter_arp.h>
+
+/*
+ *	Interface to generic neighbour cache.
+ */
+static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static int arp_constructor(struct neighbour *neigh);
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
+static void parp_redo(struct sk_buff *skb);
+
+static struct neigh_ops arp_generic_ops = {
+	.family =		AF_INET,
+	.solicit =		arp_solicit,
+	.error_report =		arp_error_report,
+	.output =		neigh_resolve_output,
+	.connected_output =	neigh_connected_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+static struct neigh_ops arp_hh_ops = {
+	.family =		AF_INET,
+	.solicit =		arp_solicit,
+	.error_report =		arp_error_report,
+	.output =		neigh_resolve_output,
+	.connected_output =	neigh_resolve_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+static struct neigh_ops arp_direct_ops = {
+	.family =		AF_INET,
+	.output =		dev_queue_xmit,
+	.connected_output =	dev_queue_xmit,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+struct neigh_ops arp_broken_ops = {
+	.family =		AF_INET,
+	.solicit =		arp_solicit,
+	.error_report =		arp_error_report,
+	.output =		neigh_compat_output,
+	.connected_output =	neigh_compat_output,
+	.hh_output =		dev_queue_xmit,
+	.queue_xmit =		dev_queue_xmit,
+};
+
+struct neigh_table arp_tbl = {
+	.family =	AF_INET,
+	.entry_size =	sizeof(struct neighbour) + 4,
+	.key_len =	4,
+	.hash =		arp_hash,
+	.constructor =	arp_constructor,
+	.proxy_redo =	parp_redo,
+	.id =		"arp_cache",
+	.parms = {
+		.tbl =			&arp_tbl,
+		.base_reachable_time =	30 * HZ,
+		.retrans_time =	1 * HZ,
+		.gc_staletime =	60 * HZ,
+		.reachable_time =		30 * HZ,
+		.delay_probe_time =	5 * HZ,
+		.queue_len =		3,
+		.ucast_probes =	3,
+		.mcast_probes =	3,
+		.anycast_delay =	1 * HZ,
+		.proxy_delay =		(8 * HZ) / 10,
+		.proxy_qlen =		64,
+		.locktime =		1 * HZ,
+	},
+	.gc_interval =	30 * HZ,
+	.gc_thresh1 =	128,
+	.gc_thresh2 =	512,
+	.gc_thresh3 =	1024,
+};
+
+int arp_mc_map(u32 addr, u8 *haddr, struct net_device *dev, int dir)
+{
+	switch (dev->type) {
+	case ARPHRD_ETHER:
+	case ARPHRD_FDDI:
+	case ARPHRD_IEEE802:
+		ip_eth_mc_map(addr, haddr);
+		return 0; 
+	case ARPHRD_IEEE802_TR:
+		ip_tr_mc_map(addr, haddr);
+		return 0;
+	case ARPHRD_INFINIBAND:
+		ip_ib_mc_map(addr, haddr);
+		return 0;
+	default:
+		if (dir) {
+			memcpy(haddr, dev->broadcast, dev->addr_len);
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+
+static u32 arp_hash(const void *pkey, const struct net_device *dev)
+{
+	return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+}
+
+static int arp_constructor(struct neighbour *neigh)
+{
+	u32 addr = *(u32*)neigh->primary_key;
+	struct net_device *dev = neigh->dev;
+	struct in_device *in_dev;
+	struct neigh_parms *parms;
+
+	neigh->type = inet_addr_type(addr);
+
+	rcu_read_lock();
+	in_dev = rcu_dereference(__in_dev_get(dev));
+	if (in_dev == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+
+	parms = in_dev->arp_parms;
+	__neigh_parms_put(neigh->parms);
+	neigh->parms = neigh_parms_clone(parms);
+	rcu_read_unlock();
+
+	if (dev->hard_header == NULL) {
+		neigh->nud_state = NUD_NOARP;
+		neigh->ops = &arp_direct_ops;
+		neigh->output = neigh->ops->queue_xmit;
+	} else {
+		/* Good devices (checked by reading texts, but only Ethernet is
+		   tested)
+
+		   ARPHRD_ETHER: (ethernet, apfddi)
+		   ARPHRD_FDDI: (fddi)
+		   ARPHRD_IEEE802: (tr)
+		   ARPHRD_METRICOM: (strip)
+		   ARPHRD_ARCNET:
+		   etc. etc. etc.
+
+		   ARPHRD_IPDDP will also work, if author repairs it.
+		   I did not it, because this driver does not work even
+		   in old paradigm.
+		 */
+
+#if 1
+		/* So... these "amateur" devices are hopeless.
+		   The only thing, that I can say now:
+		   It is very sad that we need to keep ugly obsolete
+		   code to make them happy.
+
+		   They should be moved to more reasonable state, now
+		   they use rebuild_header INSTEAD OF hard_start_xmit!!!
+		   Besides that, they are sort of out of date
+		   (a lot of redundant clones/copies, useless in 2.1),
+		   I wonder why people believe that they work.
+		 */
+		switch (dev->type) {
+		default:
+			break;
+		case ARPHRD_ROSE:	
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+		case ARPHRD_AX25:
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+		case ARPHRD_NETROM:
+#endif
+			neigh->ops = &arp_broken_ops;
+			neigh->output = neigh->ops->output;
+			return 0;
+#endif
+		;}
+#endif
+		if (neigh->type == RTN_MULTICAST) {
+			neigh->nud_state = NUD_NOARP;
+			arp_mc_map(addr, neigh->ha, dev, 1);
+		} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+			neigh->nud_state = NUD_NOARP;
+			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
+		} else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+			neigh->nud_state = NUD_NOARP;
+			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
+		}
+		if (dev->hard_header_cache)
+			neigh->ops = &arp_hh_ops;
+		else
+			neigh->ops = &arp_generic_ops;
+		if (neigh->nud_state&NUD_VALID)
+			neigh->output = neigh->ops->connected_output;
+		else
+			neigh->output = neigh->ops->output;
+	}
+	return 0;
+}
+
+static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb)
+{
+	dst_link_failure(skb);
+	kfree_skb(skb);
+}
+
+static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
+{
+	u32 saddr = 0;
+	u8  *dst_ha = NULL;
+	struct net_device *dev = neigh->dev;
+	u32 target = *(u32*)neigh->primary_key;
+	int probes = atomic_read(&neigh->probes);
+	struct in_device *in_dev = in_dev_get(dev);
+
+	if (!in_dev)
+		return;
+
+	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
+	default:
+	case 0:		/* By default announce any local IP */
+		if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL)
+			saddr = skb->nh.iph->saddr;
+		break;
+	case 1:		/* Restrict announcements of saddr in same subnet */
+		if (!skb)
+			break;
+		saddr = skb->nh.iph->saddr;
+		if (inet_addr_type(saddr) == RTN_LOCAL) {
+			/* saddr should be known to target */
+			if (inet_addr_onlink(in_dev, target, saddr))
+				break;
+		}
+		saddr = 0;
+		break;
+	case 2:		/* Avoid secondary IPs, get a primary/preferred one */
+		break;
+	}
+
+	if (in_dev)
+		in_dev_put(in_dev);
+	if (!saddr)
+		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
+
+	if ((probes -= neigh->parms->ucast_probes) < 0) {
+		if (!(neigh->nud_state&NUD_VALID))
+			printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+		dst_ha = neigh->ha;
+		read_lock_bh(&neigh->lock);
+	} else if ((probes -= neigh->parms->app_probes) < 0) {
+#ifdef CONFIG_ARPD
+		neigh_app_ns(neigh);
+#endif
+		return;
+	}
+
+	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
+		 dst_ha, dev->dev_addr, NULL);
+	if (dst_ha)
+		read_unlock_bh(&neigh->lock);
+}
+
+static int arp_ignore(struct in_device *in_dev, struct net_device *dev,
+		      u32 sip, u32 tip)
+{
+	int scope;
+
+	switch (IN_DEV_ARP_IGNORE(in_dev)) {
+	case 0:	/* Reply, the tip is already validated */
+		return 0;
+	case 1:	/* Reply only if tip is configured on the incoming interface */
+		sip = 0;
+		scope = RT_SCOPE_HOST;
+		break;
+	case 2:	/*
+		 * Reply only if tip is configured on the incoming interface
+		 * and is in same subnet as sip
+		 */
+		scope = RT_SCOPE_HOST;
+		break;
+	case 3:	/* Do not reply for scope host addresses */
+		sip = 0;
+		scope = RT_SCOPE_LINK;
+		dev = NULL;
+		break;
+	case 4:	/* Reserved */
+	case 5:
+	case 6:
+	case 7:
+		return 0;
+	case 8:	/* Do not reply */
+		return 1;
+	default:
+		return 0;
+	}
+	return !inet_confirm_addr(dev, sip, tip, scope);
+}
+
+static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev)
+{
+	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip,
+						 .saddr = tip } } };
+	struct rtable *rt;
+	int flag = 0; 
+	/*unsigned long now; */
+
+	if (ip_route_output_key(&rt, &fl) < 0) 
+		return 1;
+	if (rt->u.dst.dev != dev) { 
+		NET_INC_STATS_BH(LINUX_MIB_ARPFILTER);
+		flag = 1;
+	} 
+	ip_rt_put(rt); 
+	return flag; 
+} 
+
+/* OBSOLETE FUNCTIONS */
+
+/*
+ *	Find an arp mapping in the cache. If not found, post a request.
+ *
+ *	It is very UGLY routine: it DOES NOT use skb->dst->neighbour,
+ *	even if it exists. It is supposed that skb->dev was mangled
+ *	by a virtual device (eql, shaper). Nobody but broken devices
+ *	is allowed to use this function, it is scheduled to be removed. --ANK
+ */
+
+static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct net_device * dev)
+{
+	switch (addr_hint) {
+	case RTN_LOCAL:
+		printk(KERN_DEBUG "ARP: arp called for own IP address\n");
+		memcpy(haddr, dev->dev_addr, dev->addr_len);
+		return 1;
+	case RTN_MULTICAST:
+		arp_mc_map(paddr, haddr, dev, 1);
+		return 1;
+	case RTN_BROADCAST:
+		memcpy(haddr, dev->broadcast, dev->addr_len);
+		return 1;
+	}
+	return 0;
+}
+
+
+int arp_find(unsigned char *haddr, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	u32 paddr;
+	struct neighbour *n;
+
+	if (!skb->dst) {
+		printk(KERN_DEBUG "arp_find is called with dst==NULL\n");
+		kfree_skb(skb);
+		return 1;
+	}
+
+	paddr = ((struct rtable*)skb->dst)->rt_gateway;
+
+	if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev))
+		return 0;
+
+	n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
+
+	if (n) {
+		n->used = jiffies;
+		if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
+			read_lock_bh(&n->lock);
+ 			memcpy(haddr, n->ha, dev->addr_len);
+			read_unlock_bh(&n->lock);
+			neigh_release(n);
+			return 0;
+		}
+		neigh_release(n);
+	} else
+		kfree_skb(skb);
+	return 1;
+}
+
+/* END OF OBSOLETE FUNCTIONS */
+
+int arp_bind_neighbour(struct dst_entry *dst)
+{
+	struct net_device *dev = dst->dev;
+	struct neighbour *n = dst->neighbour;
+
+	if (dev == NULL)
+		return -EINVAL;
+	if (n == NULL) {
+		u32 nexthop = ((struct rtable*)dst)->rt_gateway;
+		if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+			nexthop = 0;
+		n = __neigh_lookup_errno(
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+		    dev->type == ARPHRD_ATM ? clip_tbl_hook :
+#endif
+		    &arp_tbl, &nexthop, dev);
+		if (IS_ERR(n))
+			return PTR_ERR(n);
+		dst->neighbour = n;
+	}
+	return 0;
+}
+
+/*
+ * Check if we can use proxy ARP for this path
+ */
+
+static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
+{
+	struct in_device *out_dev;
+	int imi, omi = -1;
+
+	if (!IN_DEV_PROXY_ARP(in_dev))
+		return 0;
+
+	if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+		return 1;
+	if (imi == -1)
+		return 0;
+
+	/* place to check for proxy_arp for routes */
+
+	if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) {
+		omi = IN_DEV_MEDIUM_ID(out_dev);
+		in_dev_put(out_dev);
+	}
+	return (omi != imi && omi != -1);
+}
+
+/*
+ *	Interface to link layer: send routine and receive handler.
+ */
+
+/*
+ *	Create an arp packet. If (dest_hw == NULL), we create a broadcast
+ *	message.
+ */
+struct sk_buff *arp_create(int type, int ptype, u32 dest_ip,
+			   struct net_device *dev, u32 src_ip,
+			   unsigned char *dest_hw, unsigned char *src_hw,
+			   unsigned char *target_hw)
+{
+	struct sk_buff *skb;
+	struct arphdr *arp;
+	unsigned char *arp_ptr;
+
+	/*
+	 *	Allocate a buffer
+	 */
+	
+	skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4)
+				+ LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+	skb->nh.raw = skb->data;
+	arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4));
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_ARP);
+	if (src_hw == NULL)
+		src_hw = dev->dev_addr;
+	if (dest_hw == NULL)
+		dest_hw = dev->broadcast;
+
+	/*
+	 *	Fill the device header for the ARP frame
+	 */
+	if (dev->hard_header &&
+	    dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0)
+		goto out;
+
+	/*
+	 * Fill out the arp protocol part.
+	 *
+	 * The arp hardware type should match the device type, except for FDDI,
+	 * which (according to RFC 1390) should always equal 1 (Ethernet).
+	 */
+	/*
+	 *	Exceptions everywhere. AX.25 uses the AX.25 PID value not the
+	 *	DIX code for the protocol. Make these device structure fields.
+	 */
+	switch (dev->type) {
+	default:
+		arp->ar_hrd = htons(dev->type);
+		arp->ar_pro = htons(ETH_P_IP);
+		break;
+
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+	case ARPHRD_AX25:
+		arp->ar_hrd = htons(ARPHRD_AX25);
+		arp->ar_pro = htons(AX25_P_IP);
+		break;
+
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+	case ARPHRD_NETROM:
+		arp->ar_hrd = htons(ARPHRD_NETROM);
+		arp->ar_pro = htons(AX25_P_IP);
+		break;
+#endif
+#endif
+
+#ifdef CONFIG_FDDI
+	case ARPHRD_FDDI:
+		arp->ar_hrd = htons(ARPHRD_ETHER);
+		arp->ar_pro = htons(ETH_P_IP);
+		break;
+#endif
+#ifdef CONFIG_TR
+	case ARPHRD_IEEE802_TR:
+		arp->ar_hrd = htons(ARPHRD_IEEE802);
+		arp->ar_pro = htons(ETH_P_IP);
+		break;
+#endif
+	}
+
+	arp->ar_hln = dev->addr_len;
+	arp->ar_pln = 4;
+	arp->ar_op = htons(type);
+
+	arp_ptr=(unsigned char *)(arp+1);
+
+	memcpy(arp_ptr, src_hw, dev->addr_len);
+	arp_ptr+=dev->addr_len;
+	memcpy(arp_ptr, &src_ip,4);
+	arp_ptr+=4;
+	if (target_hw != NULL)
+		memcpy(arp_ptr, target_hw, dev->addr_len);
+	else
+		memset(arp_ptr, 0, dev->addr_len);
+	arp_ptr+=dev->addr_len;
+	memcpy(arp_ptr, &dest_ip, 4);
+
+	return skb;
+
+out:
+	kfree_skb(skb);
+	return NULL;
+}
+
+/*
+ *	Send an arp packet.
+ */
+void arp_xmit(struct sk_buff *skb)
+{
+	/* Send it off, maybe filter it using firewalling first.  */
+	NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit);
+}
+
+/*
+ *	Create and send an arp packet.
+ */
+void arp_send(int type, int ptype, u32 dest_ip, 
+	      struct net_device *dev, u32 src_ip, 
+	      unsigned char *dest_hw, unsigned char *src_hw,
+	      unsigned char *target_hw)
+{
+	struct sk_buff *skb;
+
+	/*
+	 *	No arp on this interface.
+	 */
+	
+	if (dev->flags&IFF_NOARP)
+		return;
+
+	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
+			 dest_hw, src_hw, target_hw);
+	if (skb == NULL) {
+		return;
+	}
+
+	arp_xmit(skb);
+}
+
+static void parp_redo(struct sk_buff *skb)
+{
+	nf_reset(skb);
+	arp_rcv(skb, skb->dev, NULL);
+}
+
+/*
+ *	Process an arp request.
+ */
+
+static int arp_process(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev = in_dev_get(dev);
+	struct arphdr *arp;
+	unsigned char *arp_ptr;
+	struct rtable *rt;
+	unsigned char *sha, *tha;
+	u32 sip, tip;
+	u16 dev_type = dev->type;
+	int addr_type;
+	struct neighbour *n;
+
+	/* arp_rcv below verifies the ARP header and verifies the device
+	 * is ARP'able.
+	 */
+
+	if (in_dev == NULL)
+		goto out;
+
+	arp = skb->nh.arph;
+
+	switch (dev_type) {
+	default:	
+		if (arp->ar_pro != htons(ETH_P_IP) ||
+		    htons(dev_type) != arp->ar_hrd)
+			goto out;
+		break;
+#ifdef CONFIG_NET_ETHERNET
+	case ARPHRD_ETHER:
+#endif
+#ifdef CONFIG_TR
+	case ARPHRD_IEEE802_TR:
+#endif
+#ifdef CONFIG_FDDI
+	case ARPHRD_FDDI:
+#endif
+#ifdef CONFIG_NET_FC
+	case ARPHRD_IEEE802:
+#endif
+#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \
+    defined(CONFIG_FDDI)	 || defined(CONFIG_NET_FC)
+		/*
+		 * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802
+		 * devices, according to RFC 2625) devices will accept ARP
+		 * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2).
+		 * This is the case also of FDDI, where the RFC 1390 says that
+		 * FDDI devices should accept ARP hardware of (1) Ethernet,
+		 * however, to be more robust, we'll accept both 1 (Ethernet)
+		 * or 6 (IEEE 802.2)
+		 */
+		if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
+		     arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
+		    arp->ar_pro != htons(ETH_P_IP))
+			goto out;
+		break;
+#endif
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+	case ARPHRD_AX25:
+		if (arp->ar_pro != htons(AX25_P_IP) ||
+		    arp->ar_hrd != htons(ARPHRD_AX25))
+			goto out;
+		break;
+#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
+	case ARPHRD_NETROM:
+		if (arp->ar_pro != htons(AX25_P_IP) ||
+		    arp->ar_hrd != htons(ARPHRD_NETROM))
+			goto out;
+		break;
+#endif
+#endif
+	}
+
+	/* Understand only these message types */
+
+	if (arp->ar_op != htons(ARPOP_REPLY) &&
+	    arp->ar_op != htons(ARPOP_REQUEST))
+		goto out;
+
+/*
+ *	Extract fields
+ */
+	arp_ptr= (unsigned char *)(arp+1);
+	sha	= arp_ptr;
+	arp_ptr += dev->addr_len;
+	memcpy(&sip, arp_ptr, 4);
+	arp_ptr += 4;
+	tha	= arp_ptr;
+	arp_ptr += dev->addr_len;
+	memcpy(&tip, arp_ptr, 4);
+/* 
+ *	Check for bad requests for 127.x.x.x and requests for multicast
+ *	addresses.  If this is one such, delete it.
+ */
+	if (LOOPBACK(tip) || MULTICAST(tip))
+		goto out;
+
+/*
+ *     Special case: We must set Frame Relay source Q.922 address
+ */
+	if (dev_type == ARPHRD_DLCI)
+		sha = dev->broadcast;
+
+/*
+ *  Process entry.  The idea here is we want to send a reply if it is a
+ *  request for us or if it is a request for someone else that we hold
+ *  a proxy for.  We want to add an entry to our cache if it is a reply
+ *  to us or if it is a request for our address.  
+ *  (The assumption for this last is that if someone is requesting our 
+ *  address, they are probably intending to talk to us, so it saves time 
+ *  if we cache their address.  Their address is also probably not in 
+ *  our cache, since ours is not in their cache.)
+ * 
+ *  Putting this another way, we only care about replies if they are to
+ *  us, in which case we add them to the cache.  For requests, we care
+ *  about those for us and those for our proxies.  We reply to both,
+ *  and in the case of requests for us we add the requester to the arp 
+ *  cache.
+ */
+
+	/* Special case: IPv4 duplicate address detection packet (RFC2131) */
+	if (sip == 0) {
+		if (arp->ar_op == htons(ARPOP_REQUEST) &&
+		    inet_addr_type(tip) == RTN_LOCAL &&
+		    !arp_ignore(in_dev,dev,sip,tip))
+			arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr);
+		goto out;
+	}
+
+	if (arp->ar_op == htons(ARPOP_REQUEST) &&
+	    ip_route_input(skb, tip, sip, 0, dev) == 0) {
+
+		rt = (struct rtable*)skb->dst;
+		addr_type = rt->rt_type;
+
+		if (addr_type == RTN_LOCAL) {
+			n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+			if (n) {
+				int dont_send = 0;
+
+				if (!dont_send)
+					dont_send |= arp_ignore(in_dev,dev,sip,tip);
+				if (!dont_send && IN_DEV_ARPFILTER(in_dev))
+					dont_send |= arp_filter(sip,tip,dev); 
+				if (!dont_send)
+					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+
+				neigh_release(n);
+			}
+			goto out;
+		} else if (IN_DEV_FORWARD(in_dev)) {
+			if ((rt->rt_flags&RTCF_DNAT) ||
+			    (addr_type == RTN_UNICAST  && rt->u.dst.dev != dev &&
+			     (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) {
+				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
+				if (n)
+					neigh_release(n);
+
+				if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || 
+				    skb->pkt_type == PACKET_HOST ||
+				    in_dev->arp_parms->proxy_delay == 0) {
+					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+				} else {
+					pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+					in_dev_put(in_dev);
+					return 0;
+				}
+				goto out;
+			}
+		}
+	}
+
+	/* Update our ARP tables */
+
+	n = __neigh_lookup(&arp_tbl, &sip, dev, 0);
+
+#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP
+	/* Unsolicited ARP is not accepted by default.
+	   It is possible, that this option should be enabled for some
+	   devices (strip is candidate)
+	 */
+	if (n == NULL &&
+	    arp->ar_op == htons(ARPOP_REPLY) &&
+	    inet_addr_type(sip) == RTN_UNICAST)
+		n = __neigh_lookup(&arp_tbl, &sip, dev, -1);
+#endif
+
+	if (n) {
+		int state = NUD_REACHABLE;
+		int override;
+
+		/* If several different ARP replies follows back-to-back,
+		   use the FIRST one. It is possible, if several proxy
+		   agents are active. Taking the first reply prevents
+		   arp trashing and chooses the fastest router.
+		 */
+		override = time_after(jiffies, n->updated + n->parms->locktime);
+
+		/* Broadcast replies and request packets
+		   do not assert neighbour reachability.
+		 */
+		if (arp->ar_op != htons(ARPOP_REPLY) ||
+		    skb->pkt_type != PACKET_HOST)
+			state = NUD_STALE;
+		neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+		neigh_release(n);
+	}
+
+out:
+	if (in_dev)
+		in_dev_put(in_dev);
+	kfree_skb(skb);
+	return 0;
+}
+
+
+/*
+ *	Receive an arp request from the device layer.
+ */
+
+int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+	struct arphdr *arp;
+
+	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
+	if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
+				 (2 * dev->addr_len) +
+				 (2 * sizeof(u32)))))
+		goto freeskb;
+
+	arp = skb->nh.arph;
+	if (arp->ar_hln != dev->addr_len ||
+	    dev->flags & IFF_NOARP ||
+	    skb->pkt_type == PACKET_OTHERHOST ||
+	    skb->pkt_type == PACKET_LOOPBACK ||
+	    arp->ar_pln != 4)
+		goto freeskb;
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+		goto out_of_mem;
+
+	return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process);
+
+freeskb:
+	kfree_skb(skb);
+out_of_mem:
+	return 0;
+}
+
+/*
+ *	User level interface (ioctl)
+ */
+
+/*
+ *	Set (create) an ARP cache entry.
+ */
+
+static int arp_req_set(struct arpreq *r, struct net_device * dev)
+{
+	u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+	struct neighbour *neigh;
+	int err;
+
+	if (r->arp_flags&ATF_PUBL) {
+		u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr;
+		if (mask && mask != 0xFFFFFFFF)
+			return -EINVAL;
+		if (!dev && (r->arp_flags & ATF_COM)) {
+			dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data);
+			if (!dev)
+				return -ENODEV;
+		}
+		if (mask) {
+			if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL)
+				return -ENOBUFS;
+			return 0;
+		}
+		if (dev == NULL) {
+			ipv4_devconf.proxy_arp = 1;
+			return 0;
+		}
+		if (__in_dev_get(dev)) {
+			__in_dev_get(dev)->cnf.proxy_arp = 1;
+			return 0;
+		}
+		return -ENXIO;
+	}
+
+	if (r->arp_flags & ATF_PERM)
+		r->arp_flags |= ATF_COM;
+	if (dev == NULL) {
+		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
+							 .tos = RTO_ONLINK } } };
+		struct rtable * rt;
+		if ((err = ip_route_output_key(&rt, &fl)) != 0)
+			return err;
+		dev = rt->u.dst.dev;
+		ip_rt_put(rt);
+		if (!dev)
+			return -EINVAL;
+	}
+	switch (dev->type) {
+#ifdef CONFIG_FDDI
+	case ARPHRD_FDDI:
+		/*
+		 * According to RFC 1390, FDDI devices should accept ARP
+		 * hardware types of 1 (Ethernet).  However, to be more
+		 * robust, we'll accept hardware types of either 1 (Ethernet)
+		 * or 6 (IEEE 802.2).
+		 */
+		if (r->arp_ha.sa_family != ARPHRD_FDDI &&
+		    r->arp_ha.sa_family != ARPHRD_ETHER &&
+		    r->arp_ha.sa_family != ARPHRD_IEEE802)
+			return -EINVAL;
+		break;
+#endif
+	default:
+		if (r->arp_ha.sa_family != dev->type)
+			return -EINVAL;
+		break;
+	}
+
+	neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev);
+	err = PTR_ERR(neigh);
+	if (!IS_ERR(neigh)) {
+		unsigned state = NUD_STALE;
+		if (r->arp_flags & ATF_PERM)
+			state = NUD_PERMANENT;
+		err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+				   r->arp_ha.sa_data : NULL, state, 
+				   NEIGH_UPDATE_F_OVERRIDE|
+				   NEIGH_UPDATE_F_ADMIN);
+		neigh_release(neigh);
+	}
+	return err;
+}
+
+static unsigned arp_state_to_flags(struct neighbour *neigh)
+{
+	unsigned flags = 0;
+	if (neigh->nud_state&NUD_PERMANENT)
+		flags = ATF_PERM|ATF_COM;
+	else if (neigh->nud_state&NUD_VALID)
+		flags = ATF_COM;
+	return flags;
+}
+
+/*
+ *	Get an ARP cache entry.
+ */
+
+static int arp_req_get(struct arpreq *r, struct net_device *dev)
+{
+	u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr;
+	struct neighbour *neigh;
+	int err = -ENXIO;
+
+	neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	if (neigh) {
+		read_lock_bh(&neigh->lock);
+		memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len);
+		r->arp_flags = arp_state_to_flags(neigh);
+		read_unlock_bh(&neigh->lock);
+		r->arp_ha.sa_family = dev->type;
+		strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev));
+		neigh_release(neigh);
+		err = 0;
+	}
+	return err;
+}
+
+static int arp_req_delete(struct arpreq *r, struct net_device * dev)
+{
+	int err;
+	u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
+	struct neighbour *neigh;
+
+	if (r->arp_flags & ATF_PUBL) {
+		u32 mask =
+		       ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr;
+		if (mask == 0xFFFFFFFF)
+			return pneigh_delete(&arp_tbl, &ip, dev);
+		if (mask == 0) {
+			if (dev == NULL) {
+				ipv4_devconf.proxy_arp = 0;
+				return 0;
+			}
+			if (__in_dev_get(dev)) {
+				__in_dev_get(dev)->cnf.proxy_arp = 0;
+				return 0;
+			}
+			return -ENXIO;
+		}
+		return -EINVAL;
+	}
+
+	if (dev == NULL) {
+		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
+							 .tos = RTO_ONLINK } } };
+		struct rtable * rt;
+		if ((err = ip_route_output_key(&rt, &fl)) != 0)
+			return err;
+		dev = rt->u.dst.dev;
+		ip_rt_put(rt);
+		if (!dev)
+			return -EINVAL;
+	}
+	err = -ENXIO;
+	neigh = neigh_lookup(&arp_tbl, &ip, dev);
+	if (neigh) {
+		if (neigh->nud_state&~NUD_NOARP)
+			err = neigh_update(neigh, NULL, NUD_FAILED, 
+					   NEIGH_UPDATE_F_OVERRIDE|
+					   NEIGH_UPDATE_F_ADMIN);
+		neigh_release(neigh);
+	}
+	return err;
+}
+
+/*
+ *	Handle an ARP layer I/O control request.
+ */
+
+int arp_ioctl(unsigned int cmd, void __user *arg)
+{
+	int err;
+	struct arpreq r;
+	struct net_device *dev = NULL;
+
+	switch (cmd) {
+		case SIOCDARP:
+		case SIOCSARP:
+			if (!capable(CAP_NET_ADMIN))
+				return -EPERM;
+		case SIOCGARP:
+			err = copy_from_user(&r, arg, sizeof(struct arpreq));
+			if (err)
+				return -EFAULT;
+			break;
+		default:
+			return -EINVAL;
+	}
+
+	if (r.arp_pa.sa_family != AF_INET)
+		return -EPFNOSUPPORT;
+
+	if (!(r.arp_flags & ATF_PUBL) &&
+	    (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+		return -EINVAL;
+	if (!(r.arp_flags & ATF_NETMASK))
+		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
+							   htonl(0xFFFFFFFFUL);
+	rtnl_lock();
+	if (r.arp_dev[0]) {
+		err = -ENODEV;
+		if ((dev = __dev_get_by_name(r.arp_dev)) == NULL)
+			goto out;
+
+		/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
+		if (!r.arp_ha.sa_family)
+			r.arp_ha.sa_family = dev->type;
+		err = -EINVAL;
+		if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type)
+			goto out;
+	} else if (cmd == SIOCGARP) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	switch(cmd) {
+	case SIOCDARP:
+	        err = arp_req_delete(&r, dev);
+		break;
+	case SIOCSARP:
+		err = arp_req_set(&r, dev);
+		break;
+	case SIOCGARP:
+		err = arp_req_get(&r, dev);
+		if (!err && copy_to_user(arg, &r, sizeof(r)))
+			err = -EFAULT;
+		break;
+	}
+out:
+	rtnl_unlock();
+	return err;
+}
+
+static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	switch (event) {
+	case NETDEV_CHANGEADDR:
+		neigh_changeaddr(&arp_tbl, dev);
+		rt_cache_flush(0);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block arp_netdev_notifier = {
+	.notifier_call = arp_netdev_event,
+};
+
+/* Note, that it is not on notifier chain.
+   It is necessary, that this routine was called after route cache will be
+   flushed.
+ */
+void arp_ifdown(struct net_device *dev)
+{
+	neigh_ifdown(&arp_tbl, dev);
+}
+
+
+/*
+ *	Called once on startup.
+ */
+
+static struct packet_type arp_packet_type = {
+	.type =	__constant_htons(ETH_P_ARP),
+	.func =	arp_rcv,
+};
+
+static int arp_proc_init(void);
+
+void __init arp_init(void)
+{
+	neigh_table_init(&arp_tbl);
+
+	dev_add_pack(&arp_packet_type);
+	arp_proc_init();
+#ifdef CONFIG_SYSCTL
+	neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
+			      NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+#endif
+	register_netdevice_notifier(&arp_netdev_notifier);
+}
+
+#ifdef CONFIG_PROC_FS
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+
+/* ------------------------------------------------------------------------ */
+/*
+ *	ax25 -> ASCII conversion
+ */
+static char *ax2asc2(ax25_address *a, char *buf)
+{
+	char c, *s;
+	int n;
+
+	for (n = 0, s = buf; n < 6; n++) {
+		c = (a->ax25_call[n] >> 1) & 0x7F;
+
+		if (c != ' ') *s++ = c;
+	}
+	
+	*s++ = '-';
+
+	if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+		*s++ = '1';
+		n -= 10;
+	}
+	
+	*s++ = n + '0';
+	*s++ = '\0';
+
+	if (*buf == '\0' || *buf == '-')
+	   return "*";
+
+	return buf;
+
+}
+#endif /* CONFIG_AX25 */
+
+#define HBUFFERLEN 30
+
+static void arp_format_neigh_entry(struct seq_file *seq,
+				   struct neighbour *n)
+{
+	char hbuffer[HBUFFERLEN];
+	const char hexbuf[] = "0123456789ABCDEF";
+	int k, j;
+	char tbuf[16];
+	struct net_device *dev = n->dev;
+	int hatype = dev->type;
+
+	read_lock(&n->lock);
+	/* Convert hardware address to XX:XX:XX:XX ... form. */
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+	if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM)
+		ax2asc2((ax25_address *)n->ha, hbuffer);
+	else {
+#endif
+	for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) {
+		hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15];
+		hbuffer[k++] = hexbuf[n->ha[j] & 15];
+		hbuffer[k++] = ':';
+	}
+	hbuffer[--k] = 0;
+#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
+	}
+#endif
+	sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key));
+	seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n",
+		   tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name);
+	read_unlock(&n->lock);
+}
+
+static void arp_format_pneigh_entry(struct seq_file *seq,
+				    struct pneigh_entry *n)
+{
+	struct net_device *dev = n->dev;
+	int hatype = dev ? dev->type : 0;
+	char tbuf[16];
+
+	sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key));
+	seq_printf(seq, "%-16s 0x%-10x0x%-10x%s     *        %s\n",
+		   tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00",
+		   dev ? dev->name : "*");
+}
+
+static int arp_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, "IP address       HW type     Flags       "
+			      "HW address            Mask     Device\n");
+	} else {
+		struct neigh_seq_state *state = seq->private;
+
+		if (state->flags & NEIGH_SEQ_IS_PNEIGH)
+			arp_format_pneigh_entry(seq, v);
+		else
+			arp_format_neigh_entry(seq, v);
+	}
+
+	return 0;
+}
+
+static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	/* Don't want to confuse "arp -a" w/ magic entries,
+	 * so we tell the generic iterator to skip NUD_NOARP.
+	 */
+	return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP);
+}
+
+/* ------------------------------------------------------------------------ */
+
+static struct seq_operations arp_seq_ops = {
+	.start  = arp_seq_start,
+	.next   = neigh_seq_next,
+	.stop   = neigh_seq_stop,
+	.show   = arp_seq_show,
+};
+
+static int arp_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+       
+	if (!s)
+		goto out;
+
+	memset(s, 0, sizeof(*s));
+	rc = seq_open(file, &arp_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq	     = file->private_data;
+	seq->private = s;
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+static struct file_operations arp_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open           = arp_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_private,
+};
+
+static int __init arp_proc_init(void)
+{
+	if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops))
+		return -ENOMEM;
+	return 0;
+}
+
+#else /* CONFIG_PROC_FS */
+
+static int __init arp_proc_init(void)
+{
+	return 0;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+EXPORT_SYMBOL(arp_broken_ops);
+EXPORT_SYMBOL(arp_find);
+EXPORT_SYMBOL(arp_rcv);
+EXPORT_SYMBOL(arp_create);
+EXPORT_SYMBOL(arp_xmit);
+EXPORT_SYMBOL(arp_send);
+EXPORT_SYMBOL(arp_tbl);
+
+#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
+EXPORT_SYMBOL(clip_tbl_hook);
+#endif
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
new file mode 100644
index 000000000000..b1db561f2542
--- /dev/null
+++ b/net/ipv4/datagram.c
@@ -0,0 +1,73 @@
+/*
+ *	common UDP/RAW code
+ *	Linux INET implementation
+ *
+ * Authors:
+ * 	Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>
+ *
+ * 	This program is free software; you can redistribute it and/or
+ * 	modify it under the terms of the GNU General Public License
+ * 	as published by the Free Software Foundation; either version
+ * 	2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
+	struct rtable *rt;
+	u32 saddr;
+	int oif;
+	int err;
+
+	
+	if (addr_len < sizeof(*usin)) 
+	  	return -EINVAL;
+
+	if (usin->sin_family != AF_INET) 
+	  	return -EAFNOSUPPORT;
+
+	sk_dst_reset(sk);
+
+	oif = sk->sk_bound_dev_if;
+	saddr = inet->saddr;
+	if (MULTICAST(usin->sin_addr.s_addr)) {
+		if (!oif)
+			oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	}
+	err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
+			       RT_CONN_FLAGS(sk), oif,
+			       sk->sk_protocol,
+			       inet->sport, usin->sin_port, sk);
+	if (err)
+		return err;
+	if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) {
+		ip_rt_put(rt);
+		return -EACCES;
+	}
+  	if (!inet->saddr)
+	  	inet->saddr = rt->rt_src;	/* Update source address */
+	if (!inet->rcv_saddr)
+		inet->rcv_saddr = rt->rt_src;
+	inet->daddr = rt->rt_dst;
+	inet->dport = usin->sin_port;
+	sk->sk_state = TCP_ESTABLISHED;
+	inet->id = jiffies;
+
+	sk_dst_set(sk, &rt->u.dst);
+	return(0);
+}
+
+EXPORT_SYMBOL(ip4_datagram_connect);
+
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
new file mode 100644
index 000000000000..eea7ef010776
--- /dev/null
+++ b/net/ipv4/devinet.c
@@ -0,0 +1,1508 @@
+/*
+ *	NET3	IP device support routines.
+ *
+ *	Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ *	Derived from the IP parts of dev.c 1.0.19
+ * 		Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *				Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *
+ *	Additional Authors:
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	Changes:
+ *		Alexey Kuznetsov:	pa_* fields are replaced with ifaddr
+ *					lists.
+ *		Cyrus Durgin:		updated for kmod
+ *		Matthias Andree:	in devinet_ioctl, compare label and
+ *					address (4.4BSD alias style support),
+ *					fall back to comparing just the label
+ *					if no match found.
+ */
+
+#include <linux/config.h>
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/if_ether.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <linux/kmod.h>
+
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+
+struct ipv4_devconf ipv4_devconf = {
+	.accept_redirects = 1,
+	.send_redirects =  1,
+	.secure_redirects = 1,
+	.shared_media =	  1,
+};
+
+static struct ipv4_devconf ipv4_devconf_dflt = {
+	.accept_redirects =  1,
+	.send_redirects =    1,
+	.secure_redirects =  1,
+	.shared_media =	     1,
+	.accept_source_route = 1,
+};
+
+static void rtmsg_ifa(int event, struct in_ifaddr *);
+
+static struct notifier_block *inetaddr_chain;
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+			 int destroy);
+#ifdef CONFIG_SYSCTL
+static void devinet_sysctl_register(struct in_device *in_dev,
+				    struct ipv4_devconf *p);
+static void devinet_sysctl_unregister(struct ipv4_devconf *p);
+#endif
+
+/* Locks all the inet devices. */
+
+static struct in_ifaddr *inet_alloc_ifa(void)
+{
+	struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL);
+
+	if (ifa) {
+		memset(ifa, 0, sizeof(*ifa));
+		INIT_RCU_HEAD(&ifa->rcu_head);
+	}
+
+	return ifa;
+}
+
+static void inet_rcu_free_ifa(struct rcu_head *head)
+{
+	struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head);
+	if (ifa->ifa_dev)
+		in_dev_put(ifa->ifa_dev);
+	kfree(ifa);
+}
+
+static inline void inet_free_ifa(struct in_ifaddr *ifa)
+{
+	call_rcu(&ifa->rcu_head, inet_rcu_free_ifa);
+}
+
+void in_dev_finish_destroy(struct in_device *idev)
+{
+	struct net_device *dev = idev->dev;
+
+	BUG_TRAP(!idev->ifa_list);
+	BUG_TRAP(!idev->mc_list);
+#ifdef NET_REFCNT_DEBUG
+	printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n",
+	       idev, dev ? dev->name : "NIL");
+#endif
+	dev_put(dev);
+	if (!idev->dead)
+		printk("Freeing alive in_device %p\n", idev);
+	else {
+		kfree(idev);
+	}
+}
+
+struct in_device *inetdev_init(struct net_device *dev)
+{
+	struct in_device *in_dev;
+
+	ASSERT_RTNL();
+
+	in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL);
+	if (!in_dev)
+		goto out;
+	memset(in_dev, 0, sizeof(*in_dev));
+	INIT_RCU_HEAD(&in_dev->rcu_head);
+	memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf));
+	in_dev->cnf.sysctl = NULL;
+	in_dev->dev = dev;
+	if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+		goto out_kfree;
+	/* Reference in_dev->dev */
+	dev_hold(dev);
+#ifdef CONFIG_SYSCTL
+	neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
+			      NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+#endif
+
+	/* Account for reference dev->ip_ptr */
+	in_dev_hold(in_dev);
+	rcu_assign_pointer(dev->ip_ptr, in_dev);
+
+#ifdef CONFIG_SYSCTL
+	devinet_sysctl_register(in_dev, &in_dev->cnf);
+#endif
+	ip_mc_init_dev(in_dev);
+	if (dev->flags & IFF_UP)
+		ip_mc_up(in_dev);
+out:
+	return in_dev;
+out_kfree:
+	kfree(in_dev);
+	in_dev = NULL;
+	goto out;
+}
+
+static void in_dev_rcu_put(struct rcu_head *head)
+{
+	struct in_device *idev = container_of(head, struct in_device, rcu_head);
+	in_dev_put(idev);
+}
+
+static void inetdev_destroy(struct in_device *in_dev)
+{
+	struct in_ifaddr *ifa;
+	struct net_device *dev;
+
+	ASSERT_RTNL();
+
+	dev = in_dev->dev;
+	if (dev == &loopback_dev)
+		return;
+
+	in_dev->dead = 1;
+
+	ip_mc_destroy_dev(in_dev);
+
+	while ((ifa = in_dev->ifa_list) != NULL) {
+		inet_del_ifa(in_dev, &in_dev->ifa_list, 0);
+		inet_free_ifa(ifa);
+	}
+
+#ifdef CONFIG_SYSCTL
+	devinet_sysctl_unregister(&in_dev->cnf);
+#endif
+
+	dev->ip_ptr = NULL;
+
+#ifdef CONFIG_SYSCTL
+	neigh_sysctl_unregister(in_dev->arp_parms);
+#endif
+	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
+	arp_ifdown(dev);
+
+	call_rcu(&in_dev->rcu_head, in_dev_rcu_put);
+}
+
+int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b)
+{
+	rcu_read_lock();
+	for_primary_ifa(in_dev) {
+		if (inet_ifa_match(a, ifa)) {
+			if (!b || inet_ifa_match(b, ifa)) {
+				rcu_read_unlock();
+				return 1;
+			}
+		}
+	} endfor_ifa(in_dev);
+	rcu_read_unlock();
+	return 0;
+}
+
+static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
+			 int destroy)
+{
+	struct in_ifaddr *ifa1 = *ifap;
+
+	ASSERT_RTNL();
+
+	/* 1. Deleting primary ifaddr forces deletion all secondaries */
+
+	if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) {
+		struct in_ifaddr *ifa;
+		struct in_ifaddr **ifap1 = &ifa1->ifa_next;
+
+		while ((ifa = *ifap1) != NULL) {
+			if (!(ifa->ifa_flags & IFA_F_SECONDARY) ||
+			    ifa1->ifa_mask != ifa->ifa_mask ||
+			    !inet_ifa_match(ifa1->ifa_address, ifa)) {
+				ifap1 = &ifa->ifa_next;
+				continue;
+			}
+
+			*ifap1 = ifa->ifa_next;
+
+			rtmsg_ifa(RTM_DELADDR, ifa);
+			notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa);
+			inet_free_ifa(ifa);
+		}
+	}
+
+	/* 2. Unlink it */
+
+	*ifap = ifa1->ifa_next;
+
+	/* 3. Announce address deletion */
+
+	/* Send message first, then call notifier.
+	   At first sight, FIB update triggered by notifier
+	   will refer to already deleted ifaddr, that could confuse
+	   netlink listeners. It is not true: look, gated sees
+	   that route deleted and if it still thinks that ifaddr
+	   is valid, it will try to restore deleted routes... Grr.
+	   So that, this order is correct.
+	 */
+	rtmsg_ifa(RTM_DELADDR, ifa1);
+	notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1);
+	if (destroy) {
+		inet_free_ifa(ifa1);
+
+		if (!in_dev->ifa_list)
+			inetdev_destroy(in_dev);
+	}
+}
+
+static int inet_insert_ifa(struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct in_ifaddr *ifa1, **ifap, **last_primary;
+
+	ASSERT_RTNL();
+
+	if (!ifa->ifa_local) {
+		inet_free_ifa(ifa);
+		return 0;
+	}
+
+	ifa->ifa_flags &= ~IFA_F_SECONDARY;
+	last_primary = &in_dev->ifa_list;
+
+	for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL;
+	     ifap = &ifa1->ifa_next) {
+		if (!(ifa1->ifa_flags & IFA_F_SECONDARY) &&
+		    ifa->ifa_scope <= ifa1->ifa_scope)
+			last_primary = &ifa1->ifa_next;
+		if (ifa1->ifa_mask == ifa->ifa_mask &&
+		    inet_ifa_match(ifa1->ifa_address, ifa)) {
+			if (ifa1->ifa_local == ifa->ifa_local) {
+				inet_free_ifa(ifa);
+				return -EEXIST;
+			}
+			if (ifa1->ifa_scope != ifa->ifa_scope) {
+				inet_free_ifa(ifa);
+				return -EINVAL;
+			}
+			ifa->ifa_flags |= IFA_F_SECONDARY;
+		}
+	}
+
+	if (!(ifa->ifa_flags & IFA_F_SECONDARY)) {
+		net_srandom(ifa->ifa_local);
+		ifap = last_primary;
+	}
+
+	ifa->ifa_next = *ifap;
+	*ifap = ifa;
+
+	/* Send message first, then call notifier.
+	   Notifier will trigger FIB update, so that
+	   listeners of netlink will know about new ifaddr */
+	rtmsg_ifa(RTM_NEWADDR, ifa);
+	notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa);
+
+	return 0;
+}
+
+static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = __in_dev_get(dev);
+
+	ASSERT_RTNL();
+
+	if (!in_dev) {
+		in_dev = inetdev_init(dev);
+		if (!in_dev) {
+			inet_free_ifa(ifa);
+			return -ENOBUFS;
+		}
+	}
+	if (ifa->ifa_dev != in_dev) {
+		BUG_TRAP(!ifa->ifa_dev);
+		in_dev_hold(in_dev);
+		ifa->ifa_dev = in_dev;
+	}
+	if (LOOPBACK(ifa->ifa_local))
+		ifa->ifa_scope = RT_SCOPE_HOST;
+	return inet_insert_ifa(ifa);
+}
+
+struct in_device *inetdev_by_index(int ifindex)
+{
+	struct net_device *dev;
+	struct in_device *in_dev = NULL;
+	read_lock(&dev_base_lock);
+	dev = __dev_get_by_index(ifindex);
+	if (dev)
+		in_dev = in_dev_get(dev);
+	read_unlock(&dev_base_lock);
+	return in_dev;
+}
+
+/* Called only from RTNL semaphored context. No locks. */
+
+struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix,
+				    u32 mask)
+{
+	ASSERT_RTNL();
+
+	for_primary_ifa(in_dev) {
+		if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa))
+			return ifa;
+	} endfor_ifa(in_dev);
+	return NULL;
+}
+
+static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct rtattr **rta = arg;
+	struct in_device *in_dev;
+	struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
+	struct in_ifaddr *ifa, **ifap;
+
+	ASSERT_RTNL();
+
+	if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL)
+		goto out;
+	__in_dev_put(in_dev);
+
+	for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+	     ifap = &ifa->ifa_next) {
+		if ((rta[IFA_LOCAL - 1] &&
+		     memcmp(RTA_DATA(rta[IFA_LOCAL - 1]),
+			    &ifa->ifa_local, 4)) ||
+		    (rta[IFA_LABEL - 1] &&
+		     rtattr_strcmp(rta[IFA_LABEL - 1], ifa->ifa_label)) ||
+		    (rta[IFA_ADDRESS - 1] &&
+		     (ifm->ifa_prefixlen != ifa->ifa_prefixlen ||
+		      !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS - 1]),
+			      	      ifa))))
+			continue;
+		inet_del_ifa(in_dev, ifap, 1);
+		return 0;
+	}
+out:
+	return -EADDRNOTAVAIL;
+}
+
+static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct rtattr **rta = arg;
+	struct net_device *dev;
+	struct in_device *in_dev;
+	struct ifaddrmsg *ifm = NLMSG_DATA(nlh);
+	struct in_ifaddr *ifa;
+	int rc = -EINVAL;
+
+	ASSERT_RTNL();
+
+	if (ifm->ifa_prefixlen > 32 || !rta[IFA_LOCAL - 1])
+		goto out;
+
+	rc = -ENODEV;
+	if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL)
+		goto out;
+
+	rc = -ENOBUFS;
+	if ((in_dev = __in_dev_get(dev)) == NULL) {
+		in_dev = inetdev_init(dev);
+		if (!in_dev)
+			goto out;
+	}
+
+	if ((ifa = inet_alloc_ifa()) == NULL)
+		goto out;
+
+	if (!rta[IFA_ADDRESS - 1])
+		rta[IFA_ADDRESS - 1] = rta[IFA_LOCAL - 1];
+	memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL - 1]), 4);
+	memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS - 1]), 4);
+	ifa->ifa_prefixlen = ifm->ifa_prefixlen;
+	ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen);
+	if (rta[IFA_BROADCAST - 1])
+		memcpy(&ifa->ifa_broadcast,
+		       RTA_DATA(rta[IFA_BROADCAST - 1]), 4);
+	if (rta[IFA_ANYCAST - 1])
+		memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST - 1]), 4);
+	ifa->ifa_flags = ifm->ifa_flags;
+	ifa->ifa_scope = ifm->ifa_scope;
+	in_dev_hold(in_dev);
+	ifa->ifa_dev   = in_dev;
+	if (rta[IFA_LABEL - 1])
+		rtattr_strlcpy(ifa->ifa_label, rta[IFA_LABEL - 1], IFNAMSIZ);
+	else
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+
+	rc = inet_insert_ifa(ifa);
+out:
+	return rc;
+}
+
+/*
+ *	Determine a default network mask, based on the IP address.
+ */
+
+static __inline__ int inet_abc_len(u32 addr)
+{
+	int rc = -1;	/* Something else, probably a multicast. */
+
+  	if (ZERONET(addr))
+  		rc = 0;
+	else {
+		addr = ntohl(addr);
+
+		if (IN_CLASSA(addr))
+			rc = 8;
+		else if (IN_CLASSB(addr))
+			rc = 16;
+		else if (IN_CLASSC(addr))
+			rc = 24;
+	}
+
+  	return rc;
+}
+
+
+int devinet_ioctl(unsigned int cmd, void __user *arg)
+{
+	struct ifreq ifr;
+	struct sockaddr_in sin_orig;
+	struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+	struct in_device *in_dev;
+	struct in_ifaddr **ifap = NULL;
+	struct in_ifaddr *ifa = NULL;
+	struct net_device *dev;
+	char *colon;
+	int ret = -EFAULT;
+	int tryaddrmatch = 0;
+
+	/*
+	 *	Fetch the caller's info block into kernel space
+	 */
+
+	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
+		goto out;
+	ifr.ifr_name[IFNAMSIZ - 1] = 0;
+
+	/* save original address for comparison */
+	memcpy(&sin_orig, sin, sizeof(*sin));
+
+	colon = strchr(ifr.ifr_name, ':');
+	if (colon)
+		*colon = 0;
+
+#ifdef CONFIG_KMOD
+	dev_load(ifr.ifr_name);
+#endif
+
+	switch(cmd) {
+	case SIOCGIFADDR:	/* Get interface address */
+	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+	case SIOCGIFDSTADDR:	/* Get the destination address */
+	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		/* Note that these ioctls will not sleep,
+		   so that we do not impose a lock.
+		   One day we will be forced to put shlock here (I mean SMP)
+		 */
+		tryaddrmatch = (sin_orig.sin_family == AF_INET);
+		memset(sin, 0, sizeof(*sin));
+		sin->sin_family = AF_INET;
+		break;
+
+	case SIOCSIFFLAGS:
+		ret = -EACCES;
+		if (!capable(CAP_NET_ADMIN))
+			goto out;
+		break;
+	case SIOCSIFADDR:	/* Set interface address (and family) */
+	case SIOCSIFBRDADDR:	/* Set the broadcast address */
+	case SIOCSIFDSTADDR:	/* Set the destination address */
+	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
+		ret = -EACCES;
+		if (!capable(CAP_NET_ADMIN))
+			goto out;
+		ret = -EINVAL;
+		if (sin->sin_family != AF_INET)
+			goto out;
+		break;
+	default:
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rtnl_lock();
+
+	ret = -ENODEV;
+	if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL)
+		goto done;
+
+	if (colon)
+		*colon = ':';
+
+	if ((in_dev = __in_dev_get(dev)) != NULL) {
+		if (tryaddrmatch) {
+			/* Matthias Andree */
+			/* compare label and address (4.4BSD style) */
+			/* note: we only do this for a limited set of ioctls
+			   and only if the original address family was AF_INET.
+			   This is checked above. */
+			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+			     ifap = &ifa->ifa_next) {
+				if (!strcmp(ifr.ifr_name, ifa->ifa_label) &&
+				    sin_orig.sin_addr.s_addr ==
+							ifa->ifa_address) {
+					break; /* found */
+				}
+			}
+		}
+		/* we didn't get a match, maybe the application is
+		   4.3BSD-style and passed in junk so we fall back to
+		   comparing just the label */
+		if (!ifa) {
+			for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
+			     ifap = &ifa->ifa_next)
+				if (!strcmp(ifr.ifr_name, ifa->ifa_label))
+					break;
+		}
+	}
+
+	ret = -EADDRNOTAVAIL;
+	if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS)
+		goto done;
+
+	switch(cmd) {
+	case SIOCGIFADDR:	/* Get interface address */
+		sin->sin_addr.s_addr = ifa->ifa_local;
+		goto rarok;
+
+	case SIOCGIFBRDADDR:	/* Get the broadcast address */
+		sin->sin_addr.s_addr = ifa->ifa_broadcast;
+		goto rarok;
+
+	case SIOCGIFDSTADDR:	/* Get the destination address */
+		sin->sin_addr.s_addr = ifa->ifa_address;
+		goto rarok;
+
+	case SIOCGIFNETMASK:	/* Get the netmask for the interface */
+		sin->sin_addr.s_addr = ifa->ifa_mask;
+		goto rarok;
+
+	case SIOCSIFFLAGS:
+		if (colon) {
+			ret = -EADDRNOTAVAIL;
+			if (!ifa)
+				break;
+			ret = 0;
+			if (!(ifr.ifr_flags & IFF_UP))
+				inet_del_ifa(in_dev, ifap, 1);
+			break;
+		}
+		ret = dev_change_flags(dev, ifr.ifr_flags);
+		break;
+
+	case SIOCSIFADDR:	/* Set interface address (and family) */
+		ret = -EINVAL;
+		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+			break;
+
+		if (!ifa) {
+			ret = -ENOBUFS;
+			if ((ifa = inet_alloc_ifa()) == NULL)
+				break;
+			if (colon)
+				memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
+			else
+				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+		} else {
+			ret = 0;
+			if (ifa->ifa_local == sin->sin_addr.s_addr)
+				break;
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_broadcast = 0;
+			ifa->ifa_anycast = 0;
+		}
+
+		ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr;
+
+		if (!(dev->flags & IFF_POINTOPOINT)) {
+			ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address);
+			ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen);
+			if ((dev->flags & IFF_BROADCAST) &&
+			    ifa->ifa_prefixlen < 31)
+				ifa->ifa_broadcast = ifa->ifa_address |
+						     ~ifa->ifa_mask;
+		} else {
+			ifa->ifa_prefixlen = 32;
+			ifa->ifa_mask = inet_make_mask(32);
+		}
+		ret = inet_set_ifa(dev, ifa);
+		break;
+
+	case SIOCSIFBRDADDR:	/* Set the broadcast address */
+		ret = 0;
+		if (ifa->ifa_broadcast != sin->sin_addr.s_addr) {
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_broadcast = sin->sin_addr.s_addr;
+			inet_insert_ifa(ifa);
+		}
+		break;
+
+	case SIOCSIFDSTADDR:	/* Set the destination address */
+		ret = 0;
+		if (ifa->ifa_address == sin->sin_addr.s_addr)
+			break;
+		ret = -EINVAL;
+		if (inet_abc_len(sin->sin_addr.s_addr) < 0)
+			break;
+		ret = 0;
+		inet_del_ifa(in_dev, ifap, 0);
+		ifa->ifa_address = sin->sin_addr.s_addr;
+		inet_insert_ifa(ifa);
+		break;
+
+	case SIOCSIFNETMASK: 	/* Set the netmask for the interface */
+
+		/*
+		 *	The mask we set must be legal.
+		 */
+		ret = -EINVAL;
+		if (bad_mask(sin->sin_addr.s_addr, 0))
+			break;
+		ret = 0;
+		if (ifa->ifa_mask != sin->sin_addr.s_addr) {
+			inet_del_ifa(in_dev, ifap, 0);
+			ifa->ifa_mask = sin->sin_addr.s_addr;
+			ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask);
+
+			/* See if current broadcast address matches
+			 * with current netmask, then recalculate
+			 * the broadcast address. Otherwise it's a
+			 * funny address, so don't touch it since
+			 * the user seems to know what (s)he's doing...
+			 */
+			if ((dev->flags & IFF_BROADCAST) &&
+			    (ifa->ifa_prefixlen < 31) &&
+			    (ifa->ifa_broadcast ==
+			     (ifa->ifa_local|~ifa->ifa_mask))) {
+				ifa->ifa_broadcast = (ifa->ifa_local |
+						      ~sin->sin_addr.s_addr);
+			}
+			inet_insert_ifa(ifa);
+		}
+		break;
+	}
+done:
+	rtnl_unlock();
+out:
+	return ret;
+rarok:
+	rtnl_unlock();
+	ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0;
+	goto out;
+}
+
+static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
+{
+	struct in_device *in_dev = __in_dev_get(dev);
+	struct in_ifaddr *ifa;
+	struct ifreq ifr;
+	int done = 0;
+
+	if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+		goto out;
+
+	for (; ifa; ifa = ifa->ifa_next) {
+		if (!buf) {
+			done += sizeof(ifr);
+			continue;
+		}
+		if (len < (int) sizeof(ifr))
+			break;
+		memset(&ifr, 0, sizeof(struct ifreq));
+		if (ifa->ifa_label)
+			strcpy(ifr.ifr_name, ifa->ifa_label);
+		else
+			strcpy(ifr.ifr_name, dev->name);
+
+		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET;
+		(*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr =
+								ifa->ifa_local;
+
+		if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) {
+			done = -EFAULT;
+			break;
+		}
+		buf  += sizeof(struct ifreq);
+		len  -= sizeof(struct ifreq);
+		done += sizeof(struct ifreq);
+	}
+out:
+	return done;
+}
+
+u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
+{
+	u32 addr = 0;
+	struct in_device *in_dev;
+
+	rcu_read_lock();
+	in_dev = __in_dev_get(dev);
+	if (!in_dev)
+		goto no_in_dev;
+
+	for_primary_ifa(in_dev) {
+		if (ifa->ifa_scope > scope)
+			continue;
+		if (!dst || inet_ifa_match(dst, ifa)) {
+			addr = ifa->ifa_local;
+			break;
+		}
+		if (!addr)
+			addr = ifa->ifa_local;
+	} endfor_ifa(in_dev);
+no_in_dev:
+	rcu_read_unlock();
+
+	if (addr)
+		goto out;
+
+	/* Not loopback addresses on loopback should be preferred
+	   in this case. It is importnat that lo is the first interface
+	   in dev_base list.
+	 */
+	read_lock(&dev_base_lock);
+	rcu_read_lock();
+	for (dev = dev_base; dev; dev = dev->next) {
+		if ((in_dev = __in_dev_get(dev)) == NULL)
+			continue;
+
+		for_primary_ifa(in_dev) {
+			if (ifa->ifa_scope != RT_SCOPE_LINK &&
+			    ifa->ifa_scope <= scope) {
+				addr = ifa->ifa_local;
+				goto out_unlock_both;
+			}
+		} endfor_ifa(in_dev);
+	}
+out_unlock_both:
+	read_unlock(&dev_base_lock);
+	rcu_read_unlock();
+out:
+	return addr;
+}
+
+static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst,
+			      u32 local, int scope)
+{
+	int same = 0;
+	u32 addr = 0;
+
+	for_ifa(in_dev) {
+		if (!addr &&
+		    (local == ifa->ifa_local || !local) &&
+		    ifa->ifa_scope <= scope) {
+			addr = ifa->ifa_local;
+			if (same)
+				break;
+		}
+		if (!same) {
+			same = (!local || inet_ifa_match(local, ifa)) &&
+				(!dst || inet_ifa_match(dst, ifa));
+			if (same && addr) {
+				if (local || !dst)
+					break;
+				/* Is the selected addr into dst subnet? */
+				if (inet_ifa_match(addr, ifa))
+					break;
+				/* No, then can we use new local src? */
+				if (ifa->ifa_scope <= scope) {
+					addr = ifa->ifa_local;
+					break;
+				}
+				/* search for large dst subnet for addr */
+				same = 0;
+			}
+		}
+	} endfor_ifa(in_dev);
+
+	return same? addr : 0;
+}
+
+/*
+ * Confirm that local IP address exists using wildcards:
+ * - dev: only on this interface, 0=any interface
+ * - dst: only in the same subnet as dst, 0=any dst
+ * - local: address, 0=autoselect the local address
+ * - scope: maximum allowed scope value for the local address
+ */
+u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope)
+{
+	u32 addr = 0;
+	struct in_device *in_dev;
+
+	if (dev) {
+		rcu_read_lock();
+		if ((in_dev = __in_dev_get(dev)))
+			addr = confirm_addr_indev(in_dev, dst, local, scope);
+		rcu_read_unlock();
+
+		return addr;
+	}
+
+	read_lock(&dev_base_lock);
+	rcu_read_lock();
+	for (dev = dev_base; dev; dev = dev->next) {
+		if ((in_dev = __in_dev_get(dev))) {
+			addr = confirm_addr_indev(in_dev, dst, local, scope);
+			if (addr)
+				break;
+		}
+	}
+	rcu_read_unlock();
+	read_unlock(&dev_base_lock);
+
+	return addr;
+}
+
+/*
+ *	Device notifier
+ */
+
+int register_inetaddr_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_register(&inetaddr_chain, nb);
+}
+
+int unregister_inetaddr_notifier(struct notifier_block *nb)
+{
+	return notifier_chain_unregister(&inetaddr_chain, nb);
+}
+
+/* Rename ifa_labels for a device name change. Make some effort to preserve existing
+ * alias numbering and to create unique labels if possible.
+*/
+static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
+{ 
+	struct in_ifaddr *ifa;
+	int named = 0;
+
+	for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { 
+		char old[IFNAMSIZ], *dot; 
+
+		memcpy(old, ifa->ifa_label, IFNAMSIZ);
+		memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); 
+		if (named++ == 0)
+			continue;
+		dot = strchr(ifa->ifa_label, ':');
+		if (dot == NULL) { 
+			sprintf(old, ":%d", named); 
+			dot = old;
+		}
+		if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) { 
+			strcat(ifa->ifa_label, dot); 
+		} else { 
+			strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); 
+		} 
+	}	
+} 
+
+/* Called only under RTNL semaphore */
+
+static int inetdev_event(struct notifier_block *this, unsigned long event,
+			 void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct in_device *in_dev = __in_dev_get(dev);
+
+	ASSERT_RTNL();
+
+	if (!in_dev) {
+		if (event == NETDEV_REGISTER && dev == &loopback_dev) {
+			in_dev = inetdev_init(dev);
+			if (!in_dev)
+				panic("devinet: Failed to create loopback\n");
+			in_dev->cnf.no_xfrm = 1;
+			in_dev->cnf.no_policy = 1;
+		}
+		goto out;
+	}
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		printk(KERN_DEBUG "inetdev_event: bug\n");
+		dev->ip_ptr = NULL;
+		break;
+	case NETDEV_UP:
+		if (dev->mtu < 68)
+			break;
+		if (dev == &loopback_dev) {
+			struct in_ifaddr *ifa;
+			if ((ifa = inet_alloc_ifa()) != NULL) {
+				ifa->ifa_local =
+				  ifa->ifa_address = htonl(INADDR_LOOPBACK);
+				ifa->ifa_prefixlen = 8;
+				ifa->ifa_mask = inet_make_mask(8);
+				in_dev_hold(in_dev);
+				ifa->ifa_dev = in_dev;
+				ifa->ifa_scope = RT_SCOPE_HOST;
+				memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
+				inet_insert_ifa(ifa);
+			}
+		}
+		ip_mc_up(in_dev);
+		break;
+	case NETDEV_DOWN:
+		ip_mc_down(in_dev);
+		break;
+	case NETDEV_CHANGEMTU:
+		if (dev->mtu >= 68)
+			break;
+		/* MTU falled under 68, disable IP */
+	case NETDEV_UNREGISTER:
+		inetdev_destroy(in_dev);
+		break;
+	case NETDEV_CHANGENAME:
+		/* Do not notify about label change, this event is
+		 * not interesting to applications using netlink.
+		 */
+		inetdev_changename(dev, in_dev);
+
+#ifdef CONFIG_SYSCTL
+		devinet_sysctl_unregister(&in_dev->cnf);
+		neigh_sysctl_unregister(in_dev->arp_parms);
+		neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4,
+				      NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+		devinet_sysctl_register(in_dev, &in_dev->cnf);
+#endif
+		break;
+	}
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ip_netdev_notifier = {
+	.notifier_call =inetdev_event,
+};
+
+static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
+			    u32 pid, u32 seq, int event)
+{
+	struct ifaddrmsg *ifm;
+	struct nlmsghdr  *nlh;
+	unsigned char	 *b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm));
+	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	ifm = NLMSG_DATA(nlh);
+	ifm->ifa_family = AF_INET;
+	ifm->ifa_prefixlen = ifa->ifa_prefixlen;
+	ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT;
+	ifm->ifa_scope = ifa->ifa_scope;
+	ifm->ifa_index = ifa->ifa_dev->dev->ifindex;
+	if (ifa->ifa_address)
+		RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address);
+	if (ifa->ifa_local)
+		RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local);
+	if (ifa->ifa_broadcast)
+		RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast);
+	if (ifa->ifa_anycast)
+		RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast);
+	if (ifa->ifa_label[0])
+		RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label);
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int idx, ip_idx;
+	struct net_device *dev;
+	struct in_device *in_dev;
+	struct in_ifaddr *ifa;
+	int s_ip_idx, s_idx = cb->args[0];
+
+	s_ip_idx = ip_idx = cb->args[1];
+	read_lock(&dev_base_lock);
+	for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) {
+		if (idx < s_idx)
+			continue;
+		if (idx > s_idx)
+			s_ip_idx = 0;
+		rcu_read_lock();
+		if ((in_dev = __in_dev_get(dev)) == NULL) {
+			rcu_read_unlock();
+			continue;
+		}
+
+		for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+		     ifa = ifa->ifa_next, ip_idx++) {
+			if (ip_idx < s_ip_idx)
+				continue;
+			if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+					     cb->nlh->nlmsg_seq,
+					     RTM_NEWADDR) <= 0) {
+				rcu_read_unlock();
+				goto done;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+done:
+	read_unlock(&dev_base_lock);
+	cb->args[0] = idx;
+	cb->args[1] = ip_idx;
+
+	return skb->len;
+}
+
+static void rtmsg_ifa(int event, struct in_ifaddr* ifa)
+{
+	int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + 128);
+	struct sk_buff *skb = alloc_skb(size, GFP_KERNEL);
+
+	if (!skb)
+		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS);
+	else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) {
+		kfree_skb(skb);
+		netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL);
+	} else {
+		NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR;
+		netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL);
+	}
+}
+
+static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = {
+	 [4] = { .doit	 = inet_rtm_newaddr,  },
+	 [5] = { .doit	 = inet_rtm_deladdr,  },
+	 [6] = { .dumpit = inet_dump_ifaddr,  },
+	 [8] = { .doit	 = inet_rtm_newroute, },
+	 [9] = { .doit	 = inet_rtm_delroute, },
+	[10] = { .doit	 = inet_rtm_getroute, .dumpit = inet_dump_fib, },
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	[16] = { .doit	 = inet_rtm_newrule, },
+	[17] = { .doit	 = inet_rtm_delrule, },
+	[18] = { .dumpit = inet_dump_rules,  },
+#endif
+};
+
+#ifdef CONFIG_SYSCTL
+
+void inet_forward_change(void)
+{
+	struct net_device *dev;
+	int on = ipv4_devconf.forwarding;
+
+	ipv4_devconf.accept_redirects = !on;
+	ipv4_devconf_dflt.forwarding = on;
+
+	read_lock(&dev_base_lock);
+	for (dev = dev_base; dev; dev = dev->next) {
+		struct in_device *in_dev;
+		rcu_read_lock();
+		in_dev = __in_dev_get(dev);
+		if (in_dev)
+			in_dev->cnf.forwarding = on;
+		rcu_read_unlock();
+	}
+	read_unlock(&dev_base_lock);
+
+	rt_cache_flush(0);
+}
+
+static int devinet_sysctl_forward(ctl_table *ctl, int write,
+				  struct file* filp, void __user *buffer,
+				  size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;
+	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+	if (write && *valp != val) {
+		if (valp == &ipv4_devconf.forwarding)
+			inet_forward_change();
+		else if (valp != &ipv4_devconf_dflt.forwarding)
+			rt_cache_flush(0);
+	}
+
+	return ret;
+}
+
+int ipv4_doint_and_flush(ctl_table *ctl, int write,
+			 struct file* filp, void __user *buffer,
+			 size_t *lenp, loff_t *ppos)
+{
+	int *valp = ctl->data;
+	int val = *valp;
+	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+	if (write && *valp != val)
+		rt_cache_flush(0);
+
+	return ret;
+}
+
+int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen,
+				  void __user *oldval, size_t __user *oldlenp,
+				  void __user *newval, size_t newlen, 
+				  void **context)
+{
+	int *valp = table->data;
+	int new;
+
+	if (!newval || !newlen)
+		return 0;
+
+	if (newlen != sizeof(int))
+		return -EINVAL;
+
+	if (get_user(new, (int __user *)newval))
+		return -EFAULT;
+
+	if (new == *valp)
+		return 0;
+
+	if (oldval && oldlenp) {
+		size_t len;
+
+		if (get_user(len, oldlenp))
+			return -EFAULT;
+
+		if (len) {
+			if (len > table->maxlen)
+				len = table->maxlen;
+			if (copy_to_user(oldval, valp, len))
+				return -EFAULT;
+			if (put_user(len, oldlenp))
+				return -EFAULT;
+		}
+	}
+
+	*valp = new;
+	rt_cache_flush(0);
+	return 1;
+}
+
+
+static struct devinet_sysctl_table {
+	struct ctl_table_header *sysctl_header;
+	ctl_table		devinet_vars[__NET_IPV4_CONF_MAX];
+	ctl_table		devinet_dev[2];
+	ctl_table		devinet_conf_dir[2];
+	ctl_table		devinet_proto_dir[2];
+	ctl_table		devinet_root_dir[2];
+} devinet_sysctl = {
+	.devinet_vars = {
+		{
+			.ctl_name	= NET_IPV4_CONF_FORWARDING,
+			.procname	= "forwarding",
+			.data		= &ipv4_devconf.forwarding,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &devinet_sysctl_forward,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_MC_FORWARDING,
+			.procname	= "mc_forwarding",
+			.data		= &ipv4_devconf.mc_forwarding,
+			.maxlen		= sizeof(int),
+			.mode		= 0444,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_ACCEPT_REDIRECTS,
+			.procname	= "accept_redirects",
+			.data		= &ipv4_devconf.accept_redirects,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_SECURE_REDIRECTS,
+			.procname	= "secure_redirects",
+			.data		= &ipv4_devconf.secure_redirects,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_SHARED_MEDIA,
+			.procname	= "shared_media",
+			.data		= &ipv4_devconf.shared_media,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_RP_FILTER,
+			.procname	= "rp_filter",
+			.data		= &ipv4_devconf.rp_filter,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_SEND_REDIRECTS,
+			.procname	= "send_redirects",
+			.data		= &ipv4_devconf.send_redirects,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE,
+			.procname	= "accept_source_route",
+			.data		= &ipv4_devconf.accept_source_route,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_PROXY_ARP,
+			.procname	= "proxy_arp",
+			.data		= &ipv4_devconf.proxy_arp,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_MEDIUM_ID,
+			.procname	= "medium_id",
+			.data		= &ipv4_devconf.medium_id,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_BOOTP_RELAY,
+			.procname	= "bootp_relay",
+			.data		= &ipv4_devconf.bootp_relay,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_LOG_MARTIANS,
+			.procname	= "log_martians",
+			.data		= &ipv4_devconf.log_martians,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_TAG,
+			.procname	= "tag",
+			.data		= &ipv4_devconf.tag,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_ARPFILTER,
+			.procname	= "arp_filter",
+			.data		= &ipv4_devconf.arp_filter,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_ARP_ANNOUNCE,
+			.procname	= "arp_announce",
+			.data		= &ipv4_devconf.arp_announce,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_ARP_IGNORE,
+			.procname	= "arp_ignore",
+			.data		= &ipv4_devconf.arp_ignore,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &proc_dointvec,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_NOXFRM,
+			.procname	= "disable_xfrm",
+			.data		= &ipv4_devconf.no_xfrm,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &ipv4_doint_and_flush,
+			.strategy	= &ipv4_doint_and_flush_strategy,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_NOPOLICY,
+			.procname	= "disable_policy",
+			.data		= &ipv4_devconf.no_policy,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &ipv4_doint_and_flush,
+			.strategy	= &ipv4_doint_and_flush_strategy,
+		},
+		{
+			.ctl_name	= NET_IPV4_CONF_FORCE_IGMP_VERSION,
+			.procname	= "force_igmp_version",
+			.data		= &ipv4_devconf.force_igmp_version,
+			.maxlen		= sizeof(int),
+			.mode		= 0644,
+			.proc_handler	= &ipv4_doint_and_flush,
+			.strategy	= &ipv4_doint_and_flush_strategy,
+		},
+	},
+	.devinet_dev = {
+		{
+			.ctl_name	= NET_PROTO_CONF_ALL,
+			.procname	= "all",
+			.mode		= 0555,
+			.child		= devinet_sysctl.devinet_vars,
+		},
+	},
+	.devinet_conf_dir = {
+	        {
+			.ctl_name	= NET_IPV4_CONF,
+			.procname	= "conf",
+			.mode		= 0555,
+			.child		= devinet_sysctl.devinet_dev,
+		},
+	},
+	.devinet_proto_dir = {
+		{
+			.ctl_name	= NET_IPV4,
+			.procname	= "ipv4",
+			.mode		= 0555,
+			.child 		= devinet_sysctl.devinet_conf_dir,
+		},
+	},
+	.devinet_root_dir = {
+		{
+			.ctl_name	= CTL_NET,
+			.procname 	= "net",
+			.mode		= 0555,
+			.child		= devinet_sysctl.devinet_proto_dir,
+		},
+	},
+};
+
+static void devinet_sysctl_register(struct in_device *in_dev,
+				    struct ipv4_devconf *p)
+{
+	int i;
+	struct net_device *dev = in_dev ? in_dev->dev : NULL;
+	struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL);
+	char *dev_name = NULL;
+
+	if (!t)
+		return;
+	memcpy(t, &devinet_sysctl, sizeof(*t));
+	for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) {
+		t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf;
+		t->devinet_vars[i].de = NULL;
+	}
+
+	if (dev) {
+		dev_name = dev->name; 
+		t->devinet_dev[0].ctl_name = dev->ifindex;
+	} else {
+		dev_name = "default";
+		t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT;
+	}
+
+	/* 
+	 * Make a copy of dev_name, because '.procname' is regarded as const 
+	 * by sysctl and we wouldn't want anyone to change it under our feet
+	 * (see SIOCSIFNAME).
+	 */	
+	dev_name = net_sysctl_strdup(dev_name);
+	if (!dev_name)
+	    goto free;
+
+	t->devinet_dev[0].procname    = dev_name;
+	t->devinet_dev[0].child	      = t->devinet_vars;
+	t->devinet_dev[0].de	      = NULL;
+	t->devinet_conf_dir[0].child  = t->devinet_dev;
+	t->devinet_conf_dir[0].de     = NULL;
+	t->devinet_proto_dir[0].child = t->devinet_conf_dir;
+	t->devinet_proto_dir[0].de    = NULL;
+	t->devinet_root_dir[0].child  = t->devinet_proto_dir;
+	t->devinet_root_dir[0].de     = NULL;
+
+	t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0);
+	if (!t->sysctl_header)
+	    goto free_procname;
+
+	p->sysctl = t;
+	return;
+
+	/* error path */
+ free_procname:
+	kfree(dev_name);
+ free:
+	kfree(t);
+	return;
+}
+
+static void devinet_sysctl_unregister(struct ipv4_devconf *p)
+{
+	if (p->sysctl) {
+		struct devinet_sysctl_table *t = p->sysctl;
+		p->sysctl = NULL;
+		unregister_sysctl_table(t->sysctl_header);
+		kfree(t->devinet_dev[0].procname);
+		kfree(t);
+	}
+}
+#endif
+
+void __init devinet_init(void)
+{
+	register_gifconf(PF_INET, inet_gifconf);
+	register_netdevice_notifier(&ip_netdev_notifier);
+	rtnetlink_links[PF_INET] = inet_rtnetlink_table;
+#ifdef CONFIG_SYSCTL
+	devinet_sysctl.sysctl_header =
+		register_sysctl_table(devinet_sysctl.devinet_root_dir, 0);
+	devinet_sysctl_register(NULL, &ipv4_devconf_dflt);
+#endif
+}
+
+EXPORT_SYMBOL(devinet_ioctl);
+EXPORT_SYMBOL(in_dev_finish_destroy);
+EXPORT_SYMBOL(inet_select_addr);
+EXPORT_SYMBOL(inetdev_by_index);
+EXPORT_SYMBOL(register_inetaddr_notifier);
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
new file mode 100644
index 000000000000..053a883247ba
--- /dev/null
+++ b/net/ipv4/esp4.c
@@ -0,0 +1,510 @@
+#include <linux/config.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <asm/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+
+/* decapsulation data for use when post-processing */
+struct esp_decap_data {
+	xfrm_address_t	saddr;
+	__u16		sport;
+	__u8		proto;
+};
+
+static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	struct iphdr *top_iph;
+	struct ip_esp_hdr *esph;
+	struct crypto_tfm *tfm;
+	struct esp_data *esp;
+	struct sk_buff *trailer;
+	int blksize;
+	int clen;
+	int alen;
+	int nfrags;
+
+	/* Strip IP+ESP header. */
+	__skb_pull(skb, skb->h.raw - skb->data);
+	/* Now skb is pure payload to encrypt */
+
+	err = -ENOMEM;
+
+	/* Round to block size */
+	clen = skb->len;
+
+	esp = x->data;
+	alen = esp->auth.icv_trunc_len;
+	tfm = esp->conf.tfm;
+	blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
+	clen = (clen + 2 + blksize-1)&~(blksize-1);
+	if (esp->conf.padlen)
+		clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+
+	if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
+		goto error;
+
+	/* Fill padding... */
+	do {
+		int i;
+		for (i=0; i<clen-skb->len - 2; i++)
+			*(u8*)(trailer->tail + i) = i+1;
+	} while (0);
+	*(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2;
+	pskb_put(skb, trailer, clen - skb->len);
+
+	__skb_push(skb, skb->data - skb->nh.raw);
+	top_iph = skb->nh.iph;
+	esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4);
+	top_iph->tot_len = htons(skb->len + alen);
+	*(u8*)(trailer->tail - 1) = top_iph->protocol;
+
+	/* this is non-NULL only with UDP Encapsulation */
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+		struct udphdr *uh;
+		u32 *udpdata32;
+
+		uh = (struct udphdr *)esph;
+		uh->source = encap->encap_sport;
+		uh->dest = encap->encap_dport;
+		uh->len = htons(skb->len + alen - top_iph->ihl*4);
+		uh->check = 0;
+
+		switch (encap->encap_type) {
+		default:
+		case UDP_ENCAP_ESPINUDP:
+			esph = (struct ip_esp_hdr *)(uh + 1);
+			break;
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			udpdata32 = (u32 *)(uh + 1);
+			udpdata32[0] = udpdata32[1] = 0;
+			esph = (struct ip_esp_hdr *)(udpdata32 + 2);
+			break;
+		}
+
+		top_iph->protocol = IPPROTO_UDP;
+	} else
+		top_iph->protocol = IPPROTO_ESP;
+
+	esph->spi = x->id.spi;
+	esph->seq_no = htonl(++x->replay.oseq);
+
+	if (esp->conf.ivlen)
+		crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+
+	do {
+		struct scatterlist *sg = &esp->sgbuf[0];
+
+		if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
+			sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
+			if (!sg)
+				goto error;
+		}
+		skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen);
+		crypto_cipher_encrypt(tfm, sg, sg, clen);
+		if (unlikely(sg != &esp->sgbuf[0]))
+			kfree(sg);
+	} while (0);
+
+	if (esp->conf.ivlen) {
+		memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+		crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm));
+	}
+
+	if (esp->auth.icv_full_len) {
+		esp->auth.icv(esp, skb, (u8*)esph-skb->data,
+		              sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail);
+		pskb_put(skb, trailer, alen);
+	}
+
+	ip_send_check(top_iph);
+
+	err = 0;
+
+error:
+	return err;
+}
+
+/*
+ * Note: detecting truncated vs. non-truncated authentication data is very
+ * expensive, so we only support truncated data, which is the recommended
+ * and common case.
+ */
+static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	struct ip_esp_hdr *esph;
+	struct esp_data *esp = x->data;
+	struct sk_buff *trailer;
+	int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+	int alen = esp->auth.icv_trunc_len;
+	int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
+	int nfrags;
+	int encap_len = 0;
+
+	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr)))
+		goto out;
+
+	if (elen <= 0 || (elen & (blksize-1)))
+		goto out;
+
+	/* If integrity check is required, do this. */
+	if (esp->auth.icv_full_len) {
+		u8 sum[esp->auth.icv_full_len];
+		u8 sum1[alen];
+		
+		esp->auth.icv(esp, skb, 0, skb->len-alen, sum);
+
+		if (skb_copy_bits(skb, skb->len-alen, sum1, alen))
+			BUG();
+
+		if (unlikely(memcmp(sum, sum1, alen))) {
+			x->stats.integrity_failed++;
+			goto out;
+		}
+	}
+
+	if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0)
+		goto out;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	esph = (struct ip_esp_hdr*)skb->data;
+	iph = skb->nh.iph;
+
+	/* Get ivec. This can be wrong, check against another impls. */
+	if (esp->conf.ivlen)
+		crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm));
+
+        {
+		u8 nexthdr[2];
+		struct scatterlist *sg = &esp->sgbuf[0];
+		u8 workbuf[60];
+		int padlen;
+
+		if (unlikely(nfrags > ESP_NUM_FAST_SG)) {
+			sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC);
+			if (!sg)
+				goto out;
+		}
+		skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen);
+		crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen);
+		if (unlikely(sg != &esp->sgbuf[0]))
+			kfree(sg);
+
+		if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2))
+			BUG();
+
+		padlen = nexthdr[0];
+		if (padlen+2 >= elen)
+			goto out;
+
+		/* ... check padding bits here. Silly. :-) */ 
+
+		if (x->encap && decap && decap->decap_type) {
+			struct esp_decap_data *encap_data;
+			struct udphdr *uh = (struct udphdr *) (iph+1);
+
+			encap_data = (struct esp_decap_data *) (decap->decap_data);
+			encap_data->proto = 0;
+
+			switch (decap->decap_type) {
+			case UDP_ENCAP_ESPINUDP:
+			case UDP_ENCAP_ESPINUDP_NON_IKE:
+				encap_data->proto = AF_INET;
+				encap_data->saddr.a4 = iph->saddr;
+				encap_data->sport = uh->source;
+				encap_len = (void*)esph - (void*)uh;
+				break;
+
+			default:
+				goto out;
+			}
+		}
+
+		iph->protocol = nexthdr[1];
+		pskb_trim(skb, skb->len - alen - padlen - 2);
+		memcpy(workbuf, skb->nh.raw, iph->ihl*4);
+		skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen);
+		skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
+		memcpy(skb->nh.raw, workbuf, iph->ihl*4);
+		skb->nh.iph->tot_len = htons(skb->len);
+	}
+
+	return 0;
+
+out:
+	return -EINVAL;
+}
+
+static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+  
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap;
+		struct esp_decap_data *decap_data;
+
+		encap = x->encap;
+		decap_data = (struct esp_decap_data *)(decap->decap_data);
+
+		/* first, make sure that the decap type == the encap type */
+		if (encap->encap_type != decap->decap_type)
+			return -EINVAL;
+
+		switch (encap->encap_type) {
+		default:
+		case UDP_ENCAP_ESPINUDP:
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			/*
+			 * 1) if the NAT-T peer's IP or port changed then
+			 *    advertize the change to the keying daemon.
+			 *    This is an inbound SA, so just compare
+			 *    SRC ports.
+			 */
+			if (decap_data->proto == AF_INET &&
+			    (decap_data->saddr.a4 != x->props.saddr.a4 ||
+			     decap_data->sport != encap->encap_sport)) {
+				xfrm_address_t ipaddr;
+
+				ipaddr.a4 = decap_data->saddr.a4;
+				km_new_mapping(x, &ipaddr, decap_data->sport);
+					
+				/* XXX: perhaps add an extra
+				 * policy check here, to see
+				 * if we should allow or
+				 * reject a packet from a
+				 * different source
+				 * address/port.
+				 */
+			}
+		
+			/*
+			 * 2) ignore UDP/TCP checksums in case
+			 *    of NAT-T in Transport Mode, or
+			 *    perform other post-processing fixes
+			 *    as per * draft-ietf-ipsec-udp-encaps-06,
+			 *    section 3.1.2
+			 */
+			if (!x->props.mode)
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+			break;
+		}
+	}
+	return 0;
+}
+
+static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
+{
+	struct esp_data *esp = x->data;
+	u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+
+	if (x->props.mode) {
+		mtu = (mtu + 2 + blksize-1)&~(blksize-1);
+	} else {
+		/* The worst case. */
+		mtu += 2 + blksize;
+	}
+	if (esp->conf.padlen)
+		mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+
+	return mtu + x->props.header_len + esp->auth.icv_trunc_len;
+}
+
+static void esp4_err(struct sk_buff *skb, u32 info)
+{
+	struct iphdr *iph = (struct iphdr*)skb->data;
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2));
+	struct xfrm_state *x;
+
+	if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+	    skb->h.icmph->code != ICMP_FRAG_NEEDED)
+		return;
+
+	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+	if (!x)
+		return;
+	NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
+			ntohl(esph->spi), ntohl(iph->daddr)));
+	xfrm_state_put(x);
+}
+
+static void esp_destroy(struct xfrm_state *x)
+{
+	struct esp_data *esp = x->data;
+
+	if (!esp)
+		return;
+
+	if (esp->conf.tfm) {
+		crypto_free_tfm(esp->conf.tfm);
+		esp->conf.tfm = NULL;
+	}
+	if (esp->conf.ivec) {
+		kfree(esp->conf.ivec);
+		esp->conf.ivec = NULL;
+	}
+	if (esp->auth.tfm) {
+		crypto_free_tfm(esp->auth.tfm);
+		esp->auth.tfm = NULL;
+	}
+	if (esp->auth.work_icv) {
+		kfree(esp->auth.work_icv);
+		esp->auth.work_icv = NULL;
+	}
+	kfree(esp);
+}
+
+static int esp_init_state(struct xfrm_state *x, void *args)
+{
+	struct esp_data *esp = NULL;
+
+	/* null auth and encryption can have zero length keys */
+	if (x->aalg) {
+		if (x->aalg->alg_key_len > 512)
+			goto error;
+	}
+	if (x->ealg == NULL)
+		goto error;
+
+	esp = kmalloc(sizeof(*esp), GFP_KERNEL);
+	if (esp == NULL)
+		return -ENOMEM;
+
+	memset(esp, 0, sizeof(*esp));
+
+	if (x->aalg) {
+		struct xfrm_algo_desc *aalg_desc;
+
+		esp->auth.key = x->aalg->alg_key;
+		esp->auth.key_len = (x->aalg->alg_key_len+7)/8;
+		esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0);
+		if (esp->auth.tfm == NULL)
+			goto error;
+		esp->auth.icv = esp_hmac_digest;
+
+		aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
+		BUG_ON(!aalg_desc);
+
+		if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
+		    crypto_tfm_alg_digestsize(esp->auth.tfm)) {
+			NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n",
+			       x->aalg->alg_name,
+			       crypto_tfm_alg_digestsize(esp->auth.tfm),
+			       aalg_desc->uinfo.auth.icv_fullbits/8));
+			goto error;
+		}
+
+		esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
+		esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+
+		esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL);
+		if (!esp->auth.work_icv)
+			goto error;
+	}
+	esp->conf.key = x->ealg->alg_key;
+	esp->conf.key_len = (x->ealg->alg_key_len+7)/8;
+	if (x->props.ealgo == SADB_EALG_NULL)
+		esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB);
+	else
+		esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC);
+	if (esp->conf.tfm == NULL)
+		goto error;
+	esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm);
+	esp->conf.padlen = 0;
+	if (esp->conf.ivlen) {
+		esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL);
+		if (unlikely(esp->conf.ivec == NULL))
+			goto error;
+		get_random_bytes(esp->conf.ivec, esp->conf.ivlen);
+	}
+	if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len))
+		goto error;
+	x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen;
+	if (x->props.mode)
+		x->props.header_len += sizeof(struct iphdr);
+	if (x->encap) {
+		struct xfrm_encap_tmpl *encap = x->encap;
+
+		switch (encap->encap_type) {
+		default:
+			goto error;
+		case UDP_ENCAP_ESPINUDP:
+			x->props.header_len += sizeof(struct udphdr);
+			break;
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32);
+			break;
+		}
+	}
+	x->data = esp;
+	x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len;
+	return 0;
+
+error:
+	x->data = esp;
+	esp_destroy(x);
+	x->data = NULL;
+	return -EINVAL;
+}
+
+static struct xfrm_type esp_type =
+{
+	.description	= "ESP4",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_ESP,
+	.init_state	= esp_init_state,
+	.destructor	= esp_destroy,
+	.get_max_size	= esp4_get_max_size,
+	.input		= esp_input,
+	.post_input	= esp_post_input,
+	.output		= esp_output
+};
+
+static struct net_protocol esp4_protocol = {
+	.handler	=	xfrm4_rcv,
+	.err_handler	=	esp4_err,
+	.no_policy	=	1,
+};
+
+static int __init esp4_init(void)
+{
+	struct xfrm_decap_state decap;
+
+	if (sizeof(struct esp_decap_data)  <
+	    sizeof(decap.decap_data)) {
+		extern void decap_data_too_small(void);
+
+		decap_data_too_small();
+	}
+
+	if (xfrm_register_type(&esp_type, AF_INET) < 0) {
+		printk(KERN_INFO "ip esp init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+	if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) {
+		printk(KERN_INFO "ip esp init: can't add protocol\n");
+		xfrm_unregister_type(&esp_type, AF_INET);
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+static void __exit esp4_fini(void)
+{
+	if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0)
+		printk(KERN_INFO "ip esp close: can't remove protocol\n");
+	if (xfrm_unregister_type(&esp_type, AF_INET) < 0)
+		printk(KERN_INFO "ip esp close: can't remove xfrm type\n");
+}
+
+module_init(esp4_init);
+module_exit(esp4_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
new file mode 100644
index 000000000000..563e7d612706
--- /dev/null
+++ b/net/ipv4/fib_frontend.c
@@ -0,0 +1,611 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: FIB frontend.
+ *
+ * Version:	$Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/arp.h>
+#include <net/ip_fib.h>
+
+#define FFprint(a...) printk(KERN_DEBUG a)
+
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+
+#define RT_TABLE_MIN RT_TABLE_MAIN
+
+struct fib_table *ip_fib_local_table;
+struct fib_table *ip_fib_main_table;
+
+#else
+
+#define RT_TABLE_MIN 1
+
+struct fib_table *fib_tables[RT_TABLE_MAX+1];
+
+struct fib_table *__fib_new_table(int id)
+{
+	struct fib_table *tb;
+
+	tb = fib_hash_init(id);
+	if (!tb)
+		return NULL;
+	fib_tables[id] = tb;
+	return tb;
+}
+
+
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+
+static void fib_flush(void)
+{
+	int flushed = 0;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	struct fib_table *tb;
+	int id;
+
+	for (id = RT_TABLE_MAX; id>0; id--) {
+		if ((tb = fib_get_table(id))==NULL)
+			continue;
+		flushed += tb->tb_flush(tb);
+	}
+#else /* CONFIG_IP_MULTIPLE_TABLES */
+	flushed += ip_fib_main_table->tb_flush(ip_fib_main_table);
+	flushed += ip_fib_local_table->tb_flush(ip_fib_local_table);
+#endif /* CONFIG_IP_MULTIPLE_TABLES */
+
+	if (flushed)
+		rt_cache_flush(-1);
+}
+
+/*
+ *	Find the first device with a given source address.
+ */
+
+struct net_device * ip_dev_find(u32 addr)
+{
+	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+	struct fib_result res;
+	struct net_device *dev = NULL;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
+
+	if (!ip_fib_local_table ||
+	    ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res))
+		return NULL;
+	if (res.type != RTN_LOCAL)
+		goto out;
+	dev = FIB_RES_DEV(res);
+
+	if (dev)
+		dev_hold(dev);
+out:
+	fib_res_put(&res);
+	return dev;
+}
+
+unsigned inet_addr_type(u32 addr)
+{
+	struct flowi		fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+	struct fib_result	res;
+	unsigned ret = RTN_BROADCAST;
+
+	if (ZERONET(addr) || BADCLASS(addr))
+		return RTN_BROADCAST;
+	if (MULTICAST(addr))
+		return RTN_MULTICAST;
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r = NULL;
+#endif
+	
+	if (ip_fib_local_table) {
+		ret = RTN_UNICAST;
+		if (!ip_fib_local_table->tb_lookup(ip_fib_local_table,
+						   &fl, &res)) {
+			ret = res.type;
+			fib_res_put(&res);
+		}
+	}
+	return ret;
+}
+
+/* Given (packet source, input interface) and optional (dst, oif, tos):
+   - (main) check, that source is valid i.e. not broadcast or our local
+     address.
+   - figure out what "logical" interface this packet arrived
+     and calculate "specific destination" address.
+   - check, that packet arrived from expected physical interface.
+ */
+
+int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
+			struct net_device *dev, u32 *spec_dst, u32 *itag)
+{
+	struct in_device *in_dev;
+	struct flowi fl = { .nl_u = { .ip4_u =
+				      { .daddr = src,
+					.saddr = dst,
+					.tos = tos } },
+			    .iif = oif };
+	struct fib_result res;
+	int no_addr, rpf;
+	int ret;
+
+	no_addr = rpf = 0;
+	rcu_read_lock();
+	in_dev = __in_dev_get(dev);
+	if (in_dev) {
+		no_addr = in_dev->ifa_list == NULL;
+		rpf = IN_DEV_RPFILTER(in_dev);
+	}
+	rcu_read_unlock();
+
+	if (in_dev == NULL)
+		goto e_inval;
+
+	if (fib_lookup(&fl, &res))
+		goto last_resort;
+	if (res.type != RTN_UNICAST)
+		goto e_inval_res;
+	*spec_dst = FIB_RES_PREFSRC(res);
+	fib_combine_itag(itag, &res);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1)
+#else
+	if (FIB_RES_DEV(res) == dev)
+#endif
+	{
+		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+		fib_res_put(&res);
+		return ret;
+	}
+	fib_res_put(&res);
+	if (no_addr)
+		goto last_resort;
+	if (rpf)
+		goto e_inval;
+	fl.oif = dev->ifindex;
+
+	ret = 0;
+	if (fib_lookup(&fl, &res) == 0) {
+		if (res.type == RTN_UNICAST) {
+			*spec_dst = FIB_RES_PREFSRC(res);
+			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
+		}
+		fib_res_put(&res);
+	}
+	return ret;
+
+last_resort:
+	if (rpf)
+		goto e_inval;
+	*spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	*itag = 0;
+	return 0;
+
+e_inval_res:
+	fib_res_put(&res);
+e_inval:
+	return -EINVAL;
+}
+
+#ifndef CONFIG_IP_NOSIOCRT
+
+/*
+ *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ */
+ 
+int ip_rt_ioctl(unsigned int cmd, void __user *arg)
+{
+	int err;
+	struct kern_rta rta;
+	struct rtentry  r;
+	struct {
+		struct nlmsghdr nlh;
+		struct rtmsg	rtm;
+	} req;
+
+	switch (cmd) {
+	case SIOCADDRT:		/* Add a route */
+	case SIOCDELRT:		/* Delete a route */
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+		if (copy_from_user(&r, arg, sizeof(struct rtentry)))
+			return -EFAULT;
+		rtnl_lock();
+		err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r);
+		if (err == 0) {
+			if (cmd == SIOCDELRT) {
+				struct fib_table *tb = fib_get_table(req.rtm.rtm_table);
+				err = -ESRCH;
+				if (tb)
+					err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+			} else {
+				struct fib_table *tb = fib_new_table(req.rtm.rtm_table);
+				err = -ENOBUFS;
+				if (tb)
+					err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+			}
+			if (rta.rta_mx)
+				kfree(rta.rta_mx);
+		}
+		rtnl_unlock();
+		return err;
+	}
+	return -EINVAL;
+}
+
+#else
+
+int ip_rt_ioctl(unsigned int cmd, void *arg)
+{
+	return -EINVAL;
+}
+
+#endif
+
+static int inet_check_attr(struct rtmsg *r, struct rtattr **rta)
+{
+	int i;
+
+	for (i=1; i<=RTA_MAX; i++) {
+		struct rtattr *attr = rta[i-1];
+		if (attr) {
+			if (RTA_PAYLOAD(attr) < 4)
+				return -EINVAL;
+			if (i != RTA_MULTIPATH && i != RTA_METRICS)
+				rta[i-1] = (struct rtattr*)RTA_DATA(attr);
+		}
+	}
+	return 0;
+}
+
+int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct fib_table * tb;
+	struct rtattr **rta = arg;
+	struct rtmsg *r = NLMSG_DATA(nlh);
+
+	if (inet_check_attr(r, rta))
+		return -EINVAL;
+
+	tb = fib_get_table(r->rtm_table);
+	if (tb)
+		return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
+	return -ESRCH;
+}
+
+int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct fib_table * tb;
+	struct rtattr **rta = arg;
+	struct rtmsg *r = NLMSG_DATA(nlh);
+
+	if (inet_check_attr(r, rta))
+		return -EINVAL;
+
+	tb = fib_new_table(r->rtm_table);
+	if (tb)
+		return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb));
+	return -ENOBUFS;
+}
+
+int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int t;
+	int s_t;
+	struct fib_table *tb;
+
+	if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) &&
+	    ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)
+		return ip_rt_dump(skb, cb);
+
+	s_t = cb->args[0];
+	if (s_t == 0)
+		s_t = cb->args[0] = RT_TABLE_MIN;
+
+	for (t=s_t; t<=RT_TABLE_MAX; t++) {
+		if (t < s_t) continue;
+		if (t > s_t)
+			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+		if ((tb = fib_get_table(t))==NULL)
+			continue;
+		if (tb->tb_dump(tb, skb, cb) < 0) 
+			break;
+	}
+
+	cb->args[0] = t;
+
+	return skb->len;
+}
+
+/* Prepare and feed intra-kernel routing request.
+   Really, it should be netlink message, but :-( netlink
+   can be not configured, so that we feed it directly
+   to fib engine. It is legal, because all events occur
+   only when netlink is already locked.
+ */
+
+static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa)
+{
+	struct fib_table * tb;
+	struct {
+		struct nlmsghdr	nlh;
+		struct rtmsg	rtm;
+	} req;
+	struct kern_rta rta;
+
+	memset(&req.rtm, 0, sizeof(req.rtm));
+	memset(&rta, 0, sizeof(rta));
+
+	if (type == RTN_UNICAST)
+		tb = fib_new_table(RT_TABLE_MAIN);
+	else
+		tb = fib_new_table(RT_TABLE_LOCAL);
+
+	if (tb == NULL)
+		return;
+
+	req.nlh.nlmsg_len = sizeof(req);
+	req.nlh.nlmsg_type = cmd;
+	req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND;
+	req.nlh.nlmsg_pid = 0;
+	req.nlh.nlmsg_seq = 0;
+
+	req.rtm.rtm_dst_len = dst_len;
+	req.rtm.rtm_table = tb->tb_id;
+	req.rtm.rtm_protocol = RTPROT_KERNEL;
+	req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST);
+	req.rtm.rtm_type = type;
+
+	rta.rta_dst = &dst;
+	rta.rta_prefsrc = &ifa->ifa_local;
+	rta.rta_oif = &ifa->ifa_dev->dev->ifindex;
+
+	if (cmd == RTM_NEWROUTE)
+		tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL);
+	else
+		tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL);
+}
+
+static void fib_add_ifaddr(struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct net_device *dev = in_dev->dev;
+	struct in_ifaddr *prim = ifa;
+	u32 mask = ifa->ifa_mask;
+	u32 addr = ifa->ifa_local;
+	u32 prefix = ifa->ifa_address&mask;
+
+	if (ifa->ifa_flags&IFA_F_SECONDARY) {
+		prim = inet_ifa_byprefix(in_dev, prefix, mask);
+		if (prim == NULL) {
+			printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n");
+			return;
+		}
+	}
+
+	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
+
+	if (!(dev->flags&IFF_UP))
+		return;
+
+	/* Add broadcast address, if it is explicitly assigned. */
+	if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF)
+		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+
+	if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
+		fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
+			  RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+
+		/* Add network specific broadcasts, when it takes a sense */
+		if (ifa->ifa_prefixlen < 31) {
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+		}
+	}
+}
+
+static void fib_del_ifaddr(struct in_ifaddr *ifa)
+{
+	struct in_device *in_dev = ifa->ifa_dev;
+	struct net_device *dev = in_dev->dev;
+	struct in_ifaddr *ifa1;
+	struct in_ifaddr *prim = ifa;
+	u32 brd = ifa->ifa_address|~ifa->ifa_mask;
+	u32 any = ifa->ifa_address&ifa->ifa_mask;
+#define LOCAL_OK	1
+#define BRD_OK		2
+#define BRD0_OK		4
+#define BRD1_OK		8
+	unsigned ok = 0;
+
+	if (!(ifa->ifa_flags&IFA_F_SECONDARY))
+		fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
+			  RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+	else {
+		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
+		if (prim == NULL) {
+			printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n");
+			return;
+		}
+	}
+
+	/* Deletion is more complicated than add.
+	   We should take care of not to delete too much :-)
+
+	   Scan address list to be sure that addresses are really gone.
+	 */
+
+	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
+		if (ifa->ifa_local == ifa1->ifa_local)
+			ok |= LOCAL_OK;
+		if (ifa->ifa_broadcast == ifa1->ifa_broadcast)
+			ok |= BRD_OK;
+		if (brd == ifa1->ifa_broadcast)
+			ok |= BRD1_OK;
+		if (any == ifa1->ifa_broadcast)
+			ok |= BRD0_OK;
+	}
+
+	if (!(ok&BRD_OK))
+		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
+	if (!(ok&BRD1_OK))
+		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
+	if (!(ok&BRD0_OK))
+		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
+	if (!(ok&LOCAL_OK)) {
+		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
+
+		/* Check, that this local address finally disappeared. */
+		if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) {
+			/* And the last, but not the least thing.
+			   We must flush stray FIB entries.
+
+			   First of all, we scan fib_info list searching
+			   for stray nexthop entries, then ignite fib_flush.
+			*/
+			if (fib_sync_down(ifa->ifa_local, NULL, 0))
+				fib_flush();
+		}
+	}
+#undef LOCAL_OK
+#undef BRD_OK
+#undef BRD0_OK
+#undef BRD1_OK
+}
+
+static void fib_disable_ip(struct net_device *dev, int force)
+{
+	if (fib_sync_down(0, dev, force))
+		fib_flush();
+	rt_cache_flush(0);
+	arp_ifdown(dev);
+}
+
+static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = (struct in_ifaddr*)ptr;
+
+	switch (event) {
+	case NETDEV_UP:
+		fib_add_ifaddr(ifa);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fib_sync_up(ifa->ifa_dev->dev);
+#endif
+		rt_cache_flush(-1);
+		break;
+	case NETDEV_DOWN:
+		fib_del_ifaddr(ifa);
+		if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) {
+			/* Last address was deleted from this interface.
+			   Disable IP.
+			 */
+			fib_disable_ip(ifa->ifa_dev->dev, 1);
+		} else {
+			rt_cache_flush(-1);
+		}
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct in_device *in_dev = __in_dev_get(dev);
+
+	if (event == NETDEV_UNREGISTER) {
+		fib_disable_ip(dev, 2);
+		return NOTIFY_DONE;
+	}
+
+	if (!in_dev)
+		return NOTIFY_DONE;
+
+	switch (event) {
+	case NETDEV_UP:
+		for_ifa(in_dev) {
+			fib_add_ifaddr(ifa);
+		} endfor_ifa(in_dev);
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		fib_sync_up(dev);
+#endif
+		rt_cache_flush(-1);
+		break;
+	case NETDEV_DOWN:
+		fib_disable_ip(dev, 0);
+		break;
+	case NETDEV_CHANGEMTU:
+	case NETDEV_CHANGE:
+		rt_cache_flush(0);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block fib_inetaddr_notifier = {
+	.notifier_call =fib_inetaddr_event,
+};
+
+static struct notifier_block fib_netdev_notifier = {
+	.notifier_call =fib_netdev_event,
+};
+
+void __init ip_fib_init(void)
+{
+#ifndef CONFIG_IP_MULTIPLE_TABLES
+	ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL);
+	ip_fib_main_table  = fib_hash_init(RT_TABLE_MAIN);
+#else
+	fib_rules_init();
+#endif
+
+	register_netdevice_notifier(&fib_netdev_notifier);
+	register_inetaddr_notifier(&fib_inetaddr_notifier);
+}
+
+EXPORT_SYMBOL(inet_addr_type);
+EXPORT_SYMBOL(ip_dev_find);
+EXPORT_SYMBOL(ip_rt_ioctl);
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
new file mode 100644
index 000000000000..6506dcc01b46
--- /dev/null
+++ b/net/ipv4/fib_hash.c
@@ -0,0 +1,1086 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 FIB: lookup engine and maintenance routines.
+ *
+ * Version:	$Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#include "fib_lookup.h"
+
+static kmem_cache_t *fn_hash_kmem;
+static kmem_cache_t *fn_alias_kmem;
+
+struct fib_node {
+	struct hlist_node	fn_hash;
+	struct list_head	fn_alias;
+	u32			fn_key;
+};
+
+struct fn_zone {
+	struct fn_zone		*fz_next;	/* Next not empty zone	*/
+	struct hlist_head	*fz_hash;	/* Hash table pointer	*/
+	int			fz_nent;	/* Number of entries	*/
+
+	int			fz_divisor;	/* Hash divisor		*/
+	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
+#define FZ_HASHMASK(fz)		((fz)->fz_hashmask)
+
+	int			fz_order;	/* Zone order		*/
+	u32			fz_mask;
+#define FZ_MASK(fz)		((fz)->fz_mask)
+};
+
+/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
+ * can be cheaper than memory lookup, so that FZ_* macros are used.
+ */
+
+struct fn_hash {
+	struct fn_zone	*fn_zones[33];
+	struct fn_zone	*fn_zone_list;
+};
+
+static inline u32 fn_hash(u32 key, struct fn_zone *fz)
+{
+	u32 h = ntohl(key)>>(32 - fz->fz_order);
+	h ^= (h>>20);
+	h ^= (h>>10);
+	h ^= (h>>5);
+	h &= FZ_HASHMASK(fz);
+	return h;
+}
+
+static inline u32 fz_key(u32 dst, struct fn_zone *fz)
+{
+	return dst & FZ_MASK(fz);
+}
+
+static DEFINE_RWLOCK(fib_hash_lock);
+static unsigned int fib_hash_genid;
+
+#define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
+
+static struct hlist_head *fz_hash_alloc(int divisor)
+{
+	unsigned long size = divisor * sizeof(struct hlist_head);
+
+	if (size <= PAGE_SIZE) {
+		return kmalloc(size, GFP_KERNEL);
+	} else {
+		return (struct hlist_head *)
+			__get_free_pages(GFP_KERNEL, get_order(size));
+	}
+}
+
+/* The fib hash lock must be held when this is called. */
+static inline void fn_rebuild_zone(struct fn_zone *fz,
+				   struct hlist_head *old_ht,
+				   int old_divisor)
+{
+	int i;
+
+	for (i = 0; i < old_divisor; i++) {
+		struct hlist_node *node, *n;
+		struct fib_node *f;
+
+		hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
+			struct hlist_head *new_head;
+
+			hlist_del(&f->fn_hash);
+
+			new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+			hlist_add_head(&f->fn_hash, new_head);
+		}
+	}
+}
+
+static void fz_hash_free(struct hlist_head *hash, int divisor)
+{
+	unsigned long size = divisor * sizeof(struct hlist_head);
+
+	if (size <= PAGE_SIZE)
+		kfree(hash);
+	else
+		free_pages((unsigned long)hash, get_order(size));
+}
+
+static void fn_rehash_zone(struct fn_zone *fz)
+{
+	struct hlist_head *ht, *old_ht;
+	int old_divisor, new_divisor;
+	u32 new_hashmask;
+		
+	old_divisor = fz->fz_divisor;
+
+	switch (old_divisor) {
+	case 16:
+		new_divisor = 256;
+		break;
+	case 256:
+		new_divisor = 1024;
+		break;
+	default:
+		if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
+			printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor);
+			return;
+		}
+		new_divisor = (old_divisor << 1);
+		break;
+	}
+
+	new_hashmask = (new_divisor - 1);
+
+#if RT_CACHE_DEBUG >= 2
+	printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor);
+#endif
+
+	ht = fz_hash_alloc(new_divisor);
+
+	if (ht)	{
+		memset(ht, 0, new_divisor * sizeof(struct hlist_head));
+
+		write_lock_bh(&fib_hash_lock);
+		old_ht = fz->fz_hash;
+		fz->fz_hash = ht;
+		fz->fz_hashmask = new_hashmask;
+		fz->fz_divisor = new_divisor;
+		fn_rebuild_zone(fz, old_ht, old_divisor);
+		fib_hash_genid++;
+		write_unlock_bh(&fib_hash_lock);
+
+		fz_hash_free(old_ht, old_divisor);
+	}
+}
+
+static inline void fn_free_node(struct fib_node * f)
+{
+	kmem_cache_free(fn_hash_kmem, f);
+}
+
+static inline void fn_free_alias(struct fib_alias *fa)
+{
+	fib_release_info(fa->fa_info);
+	kmem_cache_free(fn_alias_kmem, fa);
+}
+
+static struct fn_zone *
+fn_new_zone(struct fn_hash *table, int z)
+{
+	int i;
+	struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL);
+	if (!fz)
+		return NULL;
+
+	memset(fz, 0, sizeof(struct fn_zone));
+	if (z) {
+		fz->fz_divisor = 16;
+	} else {
+		fz->fz_divisor = 1;
+	}
+	fz->fz_hashmask = (fz->fz_divisor - 1);
+	fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
+	if (!fz->fz_hash) {
+		kfree(fz);
+		return NULL;
+	}
+	memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *));
+	fz->fz_order = z;
+	fz->fz_mask = inet_make_mask(z);
+
+	/* Find the first not empty zone with more specific mask */
+	for (i=z+1; i<=32; i++)
+		if (table->fn_zones[i])
+			break;
+	write_lock_bh(&fib_hash_lock);
+	if (i>32) {
+		/* No more specific masks, we are the first. */
+		fz->fz_next = table->fn_zone_list;
+		table->fn_zone_list = fz;
+	} else {
+		fz->fz_next = table->fn_zones[i]->fz_next;
+		table->fn_zones[i]->fz_next = fz;
+	}
+	table->fn_zones[z] = fz;
+	fib_hash_genid++;
+	write_unlock_bh(&fib_hash_lock);
+	return fz;
+}
+
+static int
+fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+	int err;
+	struct fn_zone *fz;
+	struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+
+	read_lock(&fib_hash_lock);
+	for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+		struct hlist_head *head;
+		struct hlist_node *node;
+		struct fib_node *f;
+		u32 k = fz_key(flp->fl4_dst, fz);
+
+		head = &fz->fz_hash[fn_hash(k, fz)];
+		hlist_for_each_entry(f, node, head, fn_hash) {
+			if (f->fn_key != k)
+				continue;
+
+			err = fib_semantic_match(&f->fn_alias,
+						 flp, res,
+						 f->fn_key, fz->fz_mask,
+						 fz->fz_order);
+			if (err <= 0)
+				goto out;
+		}
+	}
+	err = 1;
+out:
+	read_unlock(&fib_hash_lock);
+	return err;
+}
+
+static int fn_hash_last_dflt=-1;
+
+static void
+fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+{
+	int order, last_idx;
+	struct hlist_node *node;
+	struct fib_node *f;
+	struct fib_info *fi = NULL;
+	struct fib_info *last_resort;
+	struct fn_hash *t = (struct fn_hash*)tb->tb_data;
+	struct fn_zone *fz = t->fn_zones[0];
+
+	if (fz == NULL)
+		return;
+
+	last_idx = -1;
+	last_resort = NULL;
+	order = -1;
+
+	read_lock(&fib_hash_lock);
+	hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+		struct fib_alias *fa;
+
+		list_for_each_entry(fa, &f->fn_alias, fa_list) {
+			struct fib_info *next_fi = fa->fa_info;
+
+			if (fa->fa_scope != res->scope ||
+			    fa->fa_type != RTN_UNICAST)
+				continue;
+
+			if (next_fi->fib_priority > res->fi->fib_priority)
+				break;
+			if (!next_fi->fib_nh[0].nh_gw ||
+			    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
+				continue;
+			fa->fa_state |= FA_S_ACCESSED;
+
+			if (fi == NULL) {
+				if (next_fi != res->fi)
+					break;
+			} else if (!fib_detect_death(fi, order, &last_resort,
+						     &last_idx, &fn_hash_last_dflt)) {
+				if (res->fi)
+					fib_info_put(res->fi);
+				res->fi = fi;
+				atomic_inc(&fi->fib_clntref);
+				fn_hash_last_dflt = order;
+				goto out;
+			}
+			fi = next_fi;
+			order++;
+		}
+	}
+
+	if (order <= 0 || fi == NULL) {
+		fn_hash_last_dflt = -1;
+		goto out;
+	}
+
+	if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) {
+		if (res->fi)
+			fib_info_put(res->fi);
+		res->fi = fi;
+		atomic_inc(&fi->fib_clntref);
+		fn_hash_last_dflt = order;
+		goto out;
+	}
+
+	if (last_idx >= 0) {
+		if (res->fi)
+			fib_info_put(res->fi);
+		res->fi = last_resort;
+		if (last_resort)
+			atomic_inc(&last_resort->fib_clntref);
+	}
+	fn_hash_last_dflt = last_idx;
+out:
+	read_unlock(&fib_hash_lock);
+}
+
+/* Insert node F to FZ. */
+static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
+{
+	struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
+
+	hlist_add_head(&f->fn_hash, head);
+}
+
+/* Return the node in FZ matching KEY. */
+static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key)
+{
+	struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)];
+	struct hlist_node *node;
+	struct fib_node *f;
+
+	hlist_for_each_entry(f, node, head, fn_hash) {
+		if (f->fn_key == key)
+			return f;
+	}
+
+	return NULL;
+}
+
+static int
+fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+	       struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+	struct fib_node *new_f, *f;
+	struct fib_alias *fa, *new_fa;
+	struct fn_zone *fz;
+	struct fib_info *fi;
+	int z = r->rtm_dst_len;
+	int type = r->rtm_type;
+	u8 tos = r->rtm_tos;
+	u32 key;
+	int err;
+
+	if (z > 32)
+		return -EINVAL;
+	fz = table->fn_zones[z];
+	if (!fz && !(fz = fn_new_zone(table, z)))
+		return -ENOBUFS;
+
+	key = 0;
+	if (rta->rta_dst) {
+		u32 dst;
+		memcpy(&dst, rta->rta_dst, 4);
+		if (dst & ~FZ_MASK(fz))
+			return -EINVAL;
+		key = fz_key(dst, fz);
+	}
+
+	if  ((fi = fib_create_info(r, rta, n, &err)) == NULL)
+		return err;
+
+	if (fz->fz_nent > (fz->fz_divisor<<1) &&
+	    fz->fz_divisor < FZ_MAX_DIVISOR &&
+	    (z==32 || (1<<z) > fz->fz_divisor))
+		fn_rehash_zone(fz);
+
+	f = fib_find_node(fz, key);
+
+	if (!f)
+		fa = NULL;
+	else
+		fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority);
+
+	/* Now fa, if non-NULL, points to the first fib alias
+	 * with the same keys [prefix,tos,priority], if such key already
+	 * exists or to the node before which we will insert new one.
+	 *
+	 * If fa is NULL, we will need to allocate a new one and
+	 * insert to the head of f.
+	 *
+	 * If f is NULL, no fib node matched the destination key
+	 * and we need to allocate a new one of those as well.
+	 */
+
+	if (fa && fa->fa_tos == tos &&
+	    fa->fa_info->fib_priority == fi->fib_priority) {
+		struct fib_alias *fa_orig;
+
+		err = -EEXIST;
+		if (n->nlmsg_flags & NLM_F_EXCL)
+			goto out;
+
+		if (n->nlmsg_flags & NLM_F_REPLACE) {
+			struct fib_info *fi_drop;
+			u8 state;
+
+			write_lock_bh(&fib_hash_lock);
+			fi_drop = fa->fa_info;
+			fa->fa_info = fi;
+			fa->fa_type = type;
+			fa->fa_scope = r->rtm_scope;
+			state = fa->fa_state;
+			fa->fa_state &= ~FA_S_ACCESSED;
+			fib_hash_genid++;
+			write_unlock_bh(&fib_hash_lock);
+
+			fib_release_info(fi_drop);
+			if (state & FA_S_ACCESSED)
+				rt_cache_flush(-1);
+			return 0;
+		}
+
+		/* Error if we find a perfect match which
+		 * uses the same scope, type, and nexthop
+		 * information.
+		 */
+		fa_orig = fa;
+		fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+		list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
+			if (fa->fa_tos != tos)
+				break;
+			if (fa->fa_info->fib_priority != fi->fib_priority)
+				break;
+			if (fa->fa_type == type &&
+			    fa->fa_scope == r->rtm_scope &&
+			    fa->fa_info == fi)
+				goto out;
+		}
+		if (!(n->nlmsg_flags & NLM_F_APPEND))
+			fa = fa_orig;
+	}
+
+	err = -ENOENT;
+	if (!(n->nlmsg_flags&NLM_F_CREATE))
+		goto out;
+
+	err = -ENOBUFS;
+	new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
+	if (new_fa == NULL)
+		goto out;
+
+	new_f = NULL;
+	if (!f) {
+		new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL);
+		if (new_f == NULL)
+			goto out_free_new_fa;
+
+		INIT_HLIST_NODE(&new_f->fn_hash);
+		INIT_LIST_HEAD(&new_f->fn_alias);
+		new_f->fn_key = key;
+		f = new_f;
+	}
+
+	new_fa->fa_info = fi;
+	new_fa->fa_tos = tos;
+	new_fa->fa_type = type;
+	new_fa->fa_scope = r->rtm_scope;
+	new_fa->fa_state = 0;
+
+	/*
+	 * Insert new entry to the list.
+	 */
+
+	write_lock_bh(&fib_hash_lock);
+	if (new_f)
+		fib_insert_node(fz, new_f);
+	list_add_tail(&new_fa->fa_list,
+		 (fa ? &fa->fa_list : &f->fn_alias));
+	fib_hash_genid++;
+	write_unlock_bh(&fib_hash_lock);
+
+	if (new_f)
+		fz->fz_nent++;
+	rt_cache_flush(-1);
+
+	rtmsg_fib(RTM_NEWROUTE, key, new_fa, z, tb->tb_id, n, req);
+	return 0;
+
+out_free_new_fa:
+	kmem_cache_free(fn_alias_kmem, new_fa);
+out:
+	fib_release_info(fi);
+	return err;
+}
+
+
+static int
+fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
+	       struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+	struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+	struct fib_node *f;
+	struct fib_alias *fa, *fa_to_delete;
+	int z = r->rtm_dst_len;
+	struct fn_zone *fz;
+	u32 key;
+	u8 tos = r->rtm_tos;
+
+	if (z > 32)
+		return -EINVAL;
+	if ((fz  = table->fn_zones[z]) == NULL)
+		return -ESRCH;
+
+	key = 0;
+	if (rta->rta_dst) {
+		u32 dst;
+		memcpy(&dst, rta->rta_dst, 4);
+		if (dst & ~FZ_MASK(fz))
+			return -EINVAL;
+		key = fz_key(dst, fz);
+	}
+
+	f = fib_find_node(fz, key);
+
+	if (!f)
+		fa = NULL;
+	else
+		fa = fib_find_alias(&f->fn_alias, tos, 0);
+	if (!fa)
+		return -ESRCH;
+
+	fa_to_delete = NULL;
+	fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
+	list_for_each_entry_continue(fa, &f->fn_alias, fa_list) {
+		struct fib_info *fi = fa->fa_info;
+
+		if (fa->fa_tos != tos)
+			break;
+
+		if ((!r->rtm_type ||
+		     fa->fa_type == r->rtm_type) &&
+		    (r->rtm_scope == RT_SCOPE_NOWHERE ||
+		     fa->fa_scope == r->rtm_scope) &&
+		    (!r->rtm_protocol ||
+		     fi->fib_protocol == r->rtm_protocol) &&
+		    fib_nh_match(r, n, rta, fi) == 0) {
+			fa_to_delete = fa;
+			break;
+		}
+	}
+
+	if (fa_to_delete) {
+		int kill_fn;
+
+		fa = fa_to_delete;
+		rtmsg_fib(RTM_DELROUTE, key, fa, z, tb->tb_id, n, req);
+
+		kill_fn = 0;
+		write_lock_bh(&fib_hash_lock);
+		list_del(&fa->fa_list);
+		if (list_empty(&f->fn_alias)) {
+			hlist_del(&f->fn_hash);
+			kill_fn = 1;
+		}
+		fib_hash_genid++;
+		write_unlock_bh(&fib_hash_lock);
+
+		if (fa->fa_state & FA_S_ACCESSED)
+			rt_cache_flush(-1);
+		fn_free_alias(fa);
+		if (kill_fn) {
+			fn_free_node(f);
+			fz->fz_nent--;
+		}
+
+		return 0;
+	}
+	return -ESRCH;
+}
+
+static int fn_flush_list(struct fn_zone *fz, int idx)
+{
+	struct hlist_head *head = &fz->fz_hash[idx];
+	struct hlist_node *node, *n;
+	struct fib_node *f;
+	int found = 0;
+
+	hlist_for_each_entry_safe(f, node, n, head, fn_hash) {
+		struct fib_alias *fa, *fa_node;
+		int kill_f;
+
+		kill_f = 0;
+		list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) {
+			struct fib_info *fi = fa->fa_info;
+
+			if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
+				write_lock_bh(&fib_hash_lock);
+				list_del(&fa->fa_list);
+				if (list_empty(&f->fn_alias)) {
+					hlist_del(&f->fn_hash);
+					kill_f = 1;
+				}
+				fib_hash_genid++;
+				write_unlock_bh(&fib_hash_lock);
+
+				fn_free_alias(fa);
+				found++;
+			}
+		}
+		if (kill_f) {
+			fn_free_node(f);
+			fz->fz_nent--;
+		}
+	}
+	return found;
+}
+
+static int fn_hash_flush(struct fib_table *tb)
+{
+	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
+	struct fn_zone *fz;
+	int found = 0;
+
+	for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+		int i;
+
+		for (i = fz->fz_divisor - 1; i >= 0; i--)
+			found += fn_flush_list(fz, i);
+	}
+	return found;
+}
+
+
+static inline int
+fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
+		     struct fib_table *tb,
+		     struct fn_zone *fz,
+		     struct hlist_head *head)
+{
+	struct hlist_node *node;
+	struct fib_node *f;
+	int i, s_i;
+
+	s_i = cb->args[3];
+	i = 0;
+	hlist_for_each_entry(f, node, head, fn_hash) {
+		struct fib_alias *fa;
+
+		list_for_each_entry(fa, &f->fn_alias, fa_list) {
+			if (i < s_i)
+				goto next;
+
+			if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
+					  cb->nlh->nlmsg_seq,
+					  RTM_NEWROUTE,
+					  tb->tb_id,
+					  fa->fa_type,
+					  fa->fa_scope,
+					  &f->fn_key,
+					  fz->fz_order,
+					  fa->fa_tos,
+					  fa->fa_info) < 0) {
+				cb->args[3] = i;
+				return -1;
+			}
+		next:
+			i++;
+		}
+	}
+	cb->args[3] = i;
+	return skb->len;
+}
+
+static inline int
+fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
+		   struct fib_table *tb,
+		   struct fn_zone *fz)
+{
+	int h, s_h;
+
+	s_h = cb->args[2];
+	for (h=0; h < fz->fz_divisor; h++) {
+		if (h < s_h) continue;
+		if (h > s_h)
+			memset(&cb->args[3], 0,
+			       sizeof(cb->args) - 3*sizeof(cb->args[0]));
+		if (fz->fz_hash == NULL ||
+		    hlist_empty(&fz->fz_hash[h]))
+			continue;
+		if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) {
+			cb->args[2] = h;
+			return -1;
+		}
+	}
+	cb->args[2] = h;
+	return skb->len;
+}
+
+static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int m, s_m;
+	struct fn_zone *fz;
+	struct fn_hash *table = (struct fn_hash*)tb->tb_data;
+
+	s_m = cb->args[1];
+	read_lock(&fib_hash_lock);
+	for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
+		if (m < s_m) continue;
+		if (m > s_m)
+			memset(&cb->args[2], 0,
+			       sizeof(cb->args) - 2*sizeof(cb->args[0]));
+		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
+			cb->args[1] = m;
+			read_unlock(&fib_hash_lock);
+			return -1;
+		}
+	}
+	read_unlock(&fib_hash_lock);
+	cb->args[1] = m;
+	return skb->len;
+}
+
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+struct fib_table * fib_hash_init(int id)
+#else
+struct fib_table * __init fib_hash_init(int id)
+#endif
+{
+	struct fib_table *tb;
+
+	if (fn_hash_kmem == NULL)
+		fn_hash_kmem = kmem_cache_create("ip_fib_hash",
+						 sizeof(struct fib_node),
+						 0, SLAB_HWCACHE_ALIGN,
+						 NULL, NULL);
+
+	if (fn_alias_kmem == NULL)
+		fn_alias_kmem = kmem_cache_create("ip_fib_alias",
+						  sizeof(struct fib_alias),
+						  0, SLAB_HWCACHE_ALIGN,
+						  NULL, NULL);
+
+	tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash),
+		     GFP_KERNEL);
+	if (tb == NULL)
+		return NULL;
+
+	tb->tb_id = id;
+	tb->tb_lookup = fn_hash_lookup;
+	tb->tb_insert = fn_hash_insert;
+	tb->tb_delete = fn_hash_delete;
+	tb->tb_flush = fn_hash_flush;
+	tb->tb_select_default = fn_hash_select_default;
+	tb->tb_dump = fn_hash_dump;
+	memset(tb->tb_data, 0, sizeof(struct fn_hash));
+	return tb;
+}
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+struct fib_iter_state {
+	struct fn_zone	*zone;
+	int		bucket;
+	struct hlist_head *hash_head;
+	struct fib_node *fn;
+	struct fib_alias *fa;
+	loff_t pos;
+	unsigned int genid;
+	int valid;
+};
+
+static struct fib_alias *fib_get_first(struct seq_file *seq)
+{
+	struct fib_iter_state *iter = seq->private;
+	struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data;
+
+	iter->bucket    = 0;
+	iter->hash_head = NULL;
+	iter->fn        = NULL;
+	iter->fa        = NULL;
+	iter->pos	= 0;
+	iter->genid	= fib_hash_genid;
+	iter->valid	= 1;
+
+	for (iter->zone = table->fn_zone_list; iter->zone;
+	     iter->zone = iter->zone->fz_next) {
+		int maxslot;
+
+		if (!iter->zone->fz_nent)
+			continue;
+
+		iter->hash_head = iter->zone->fz_hash;
+		maxslot = iter->zone->fz_divisor;
+
+		for (iter->bucket = 0; iter->bucket < maxslot;
+		     ++iter->bucket, ++iter->hash_head) {
+			struct hlist_node *node;
+			struct fib_node *fn;
+
+			hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) {
+				struct fib_alias *fa;
+
+				list_for_each_entry(fa,&fn->fn_alias,fa_list) {
+					iter->fn = fn;
+					iter->fa = fa;
+					goto out;
+				}
+			}
+		}
+	}
+out:
+	return iter->fa;
+}
+
+static struct fib_alias *fib_get_next(struct seq_file *seq)
+{
+	struct fib_iter_state *iter = seq->private;
+	struct fib_node *fn;
+	struct fib_alias *fa;
+
+	/* Advance FA, if any. */
+	fn = iter->fn;
+	fa = iter->fa;
+	if (fa) {
+		BUG_ON(!fn);
+		list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) {
+			iter->fa = fa;
+			goto out;
+		}
+	}
+
+	fa = iter->fa = NULL;
+
+	/* Advance FN. */
+	if (fn) {
+		struct hlist_node *node = &fn->fn_hash;
+		hlist_for_each_entry_continue(fn, node, fn_hash) {
+			iter->fn = fn;
+
+			list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+				iter->fa = fa;
+				goto out;
+			}
+		}
+	}
+
+	fn = iter->fn = NULL;
+
+	/* Advance hash chain. */
+	if (!iter->zone)
+		goto out;
+
+	for (;;) {
+		struct hlist_node *node;
+		int maxslot;
+
+		maxslot = iter->zone->fz_divisor;
+
+		while (++iter->bucket < maxslot) {
+			iter->hash_head++;
+
+			hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
+				list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+					iter->fn = fn;
+					iter->fa = fa;
+					goto out;
+				}
+			}
+		}
+
+		iter->zone = iter->zone->fz_next;
+
+		if (!iter->zone)
+			goto out;
+		
+		iter->bucket = 0;
+		iter->hash_head = iter->zone->fz_hash;
+
+		hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) {
+			list_for_each_entry(fa, &fn->fn_alias, fa_list) {
+				iter->fn = fn;
+				iter->fa = fa;
+				goto out;
+			}
+		}
+	}
+out:
+	iter->pos++;
+	return fa;
+}
+
+static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct fib_iter_state *iter = seq->private;
+	struct fib_alias *fa;
+	
+	if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) {
+		fa   = iter->fa;
+		pos -= iter->pos;
+	} else
+		fa = fib_get_first(seq);
+
+	if (fa)
+		while (pos && (fa = fib_get_next(seq)))
+			--pos;
+	return pos ? NULL : fa;
+}
+
+static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	void *v = NULL;
+
+	read_lock(&fib_hash_lock);
+	if (ip_fib_main_table)
+		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+	return v;
+}
+
+static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	++*pos;
+	return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq);
+}
+
+static void fib_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock(&fib_hash_lock);
+}
+
+static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi)
+{
+	static unsigned type2flags[RTN_MAX + 1] = {
+		[7] = RTF_REJECT, [8] = RTF_REJECT,
+	};
+	unsigned flags = type2flags[type];
+
+	if (fi && fi->fib_nh->nh_gw)
+		flags |= RTF_GATEWAY;
+	if (mask == 0xFFFFFFFF)
+		flags |= RTF_HOST;
+	flags |= RTF_UP;
+	return flags;
+}
+
+/* 
+ *	This outputs /proc/net/route.
+ *
+ *	It always works in backward compatibility mode.
+ *	The format of the file is not supposed to be changed.
+ */
+static int fib_seq_show(struct seq_file *seq, void *v)
+{
+	struct fib_iter_state *iter;
+	char bf[128];
+	u32 prefix, mask;
+	unsigned flags;
+	struct fib_node *f;
+	struct fib_alias *fa;
+	struct fib_info *fi;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
+			   "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
+			   "\tWindow\tIRTT");
+		goto out;
+	}
+
+	iter	= seq->private;
+	f	= iter->fn;
+	fa	= iter->fa;
+	fi	= fa->fa_info;
+	prefix	= f->fn_key;
+	mask	= FZ_MASK(iter->zone);
+	flags	= fib_flag_trans(fa->fa_type, mask, fi);
+	if (fi)
+		snprintf(bf, sizeof(bf),
+			 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+			 fi->fib_dev ? fi->fib_dev->name : "*", prefix,
+			 fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority,
+			 mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
+			 fi->fib_window,
+			 fi->fib_rtt >> 3);
+	else
+		snprintf(bf, sizeof(bf),
+			 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
+			 prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0);
+	seq_printf(seq, "%-127s\n", bf);
+out:
+	return 0;
+}
+
+static struct seq_operations fib_seq_ops = {
+	.start  = fib_seq_start,
+	.next   = fib_seq_next,
+	.stop   = fib_seq_stop,
+	.show   = fib_seq_show,
+};
+
+static int fib_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct fib_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+       
+	if (!s)
+		goto out;
+
+	rc = seq_open(file, &fib_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq	     = file->private_data;
+	seq->private = s;
+	memset(s, 0, sizeof(*s));
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+static struct file_operations fib_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open           = fib_seq_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release	= seq_release_private,
+};
+
+int __init fib_proc_init(void)
+{
+	if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops))
+		return -ENOMEM;
+	return 0;
+}
+
+void __init fib_proc_exit(void)
+{
+	proc_net_remove("route");
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
new file mode 100644
index 000000000000..ac4485f75e97
--- /dev/null
+++ b/net/ipv4/fib_lookup.h
@@ -0,0 +1,43 @@
+#ifndef _FIB_LOOKUP_H
+#define _FIB_LOOKUP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <net/ip_fib.h>
+
+struct fib_alias {
+	struct list_head	fa_list;
+	struct fib_info		*fa_info;
+	u8			fa_tos;
+	u8			fa_type;
+	u8			fa_scope;
+	u8			fa_state;
+};
+
+#define FA_S_ACCESSED	0x01
+
+/* Exported by fib_semantics.c */
+extern int fib_semantic_match(struct list_head *head,
+			      const struct flowi *flp,
+			      struct fib_result *res, __u32 zone, __u32 mask,
+				int prefixlen);
+extern void fib_release_info(struct fib_info *);
+extern struct fib_info *fib_create_info(const struct rtmsg *r,
+					struct kern_rta *rta,
+					const struct nlmsghdr *,
+					int *err);
+extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *,
+			struct kern_rta *rta, struct fib_info *fi);
+extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+			 u8 tb_id, u8 type, u8 scope, void *dst,
+			 int dst_len, u8 tos, struct fib_info *fi);
+extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
+		      int z, int tb_id,
+		      struct nlmsghdr *n, struct netlink_skb_parms *req);
+extern struct fib_alias *fib_find_alias(struct list_head *fah,
+					u8 tos, u32 prio);
+extern int fib_detect_death(struct fib_info *fi, int order,
+			    struct fib_info **last_resort,
+			    int *last_idx, int *dflt);
+
+#endif /* _FIB_LOOKUP_H */
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
new file mode 100644
index 000000000000..39d0aadb9a2a
--- /dev/null
+++ b/net/ipv4/fib_rules.c
@@ -0,0 +1,437 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: policy rules.
+ *
+ * Version:	$Id: fib_rules.c,v 1.17 2001/10/31 21:55:54 davem Exp $
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Fixes:
+ * 		Rani Assaf	:	local_rule cannot be deleted
+ *		Marc Boucher	:	routing by fwmark
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+
+#define FRprintk(a...)
+
+struct fib_rule
+{
+	struct fib_rule *r_next;
+	atomic_t	r_clntref;
+	u32		r_preference;
+	unsigned char	r_table;
+	unsigned char	r_action;
+	unsigned char	r_dst_len;
+	unsigned char	r_src_len;
+	u32		r_src;
+	u32		r_srcmask;
+	u32		r_dst;
+	u32		r_dstmask;
+	u32		r_srcmap;
+	u8		r_flags;
+	u8		r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	u32		r_fwmark;
+#endif
+	int		r_ifindex;
+#ifdef CONFIG_NET_CLS_ROUTE
+	__u32		r_tclassid;
+#endif
+	char		r_ifname[IFNAMSIZ];
+	int		r_dead;
+};
+
+static struct fib_rule default_rule = {
+	.r_clntref =	ATOMIC_INIT(2),
+	.r_preference =	0x7FFF,
+	.r_table =	RT_TABLE_DEFAULT,
+	.r_action =	RTN_UNICAST,
+};
+
+static struct fib_rule main_rule = {
+	.r_next =	&default_rule,
+	.r_clntref =	ATOMIC_INIT(2),
+	.r_preference =	0x7FFE,
+	.r_table =	RT_TABLE_MAIN,
+	.r_action =	RTN_UNICAST,
+};
+
+static struct fib_rule local_rule = {
+	.r_next =	&main_rule,
+	.r_clntref =	ATOMIC_INIT(2),
+	.r_table =	RT_TABLE_LOCAL,
+	.r_action =	RTN_UNICAST,
+};
+
+static struct fib_rule *fib_rules = &local_rule;
+static DEFINE_RWLOCK(fib_rules_lock);
+
+int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct rtattr **rta = arg;
+	struct rtmsg *rtm = NLMSG_DATA(nlh);
+	struct fib_rule *r, **rp;
+	int err = -ESRCH;
+
+	for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) {
+		if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) &&
+		    rtm->rtm_src_len == r->r_src_len &&
+		    rtm->rtm_dst_len == r->r_dst_len &&
+		    (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) &&
+		    rtm->rtm_tos == r->r_tos &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		    (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) &&
+#endif
+		    (!rtm->rtm_type || rtm->rtm_type == r->r_action) &&
+		    (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) &&
+		    (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) &&
+		    (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) {
+			err = -EPERM;
+			if (r == &local_rule)
+				break;
+
+			write_lock_bh(&fib_rules_lock);
+			*rp = r->r_next;
+			r->r_dead = 1;
+			write_unlock_bh(&fib_rules_lock);
+			fib_rule_put(r);
+			err = 0;
+			break;
+		}
+	}
+	return err;
+}
+
+/* Allocate new unique table id */
+
+static struct fib_table *fib_empty_table(void)
+{
+	int id;
+
+	for (id = 1; id <= RT_TABLE_MAX; id++)
+		if (fib_tables[id] == NULL)
+			return __fib_new_table(id);
+	return NULL;
+}
+
+void fib_rule_put(struct fib_rule *r)
+{
+	if (atomic_dec_and_test(&r->r_clntref)) {
+		if (r->r_dead)
+			kfree(r);
+		else
+			printk("Freeing alive rule %p\n", r);
+	}
+}
+
+int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct rtattr **rta = arg;
+	struct rtmsg *rtm = NLMSG_DATA(nlh);
+	struct fib_rule *r, *new_r, **rp;
+	unsigned char table_id;
+
+	if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 ||
+	    (rtm->rtm_tos & ~IPTOS_TOS_MASK))
+		return -EINVAL;
+
+	if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ)
+		return -EINVAL;
+
+	table_id = rtm->rtm_table;
+	if (table_id == RT_TABLE_UNSPEC) {
+		struct fib_table *table;
+		if (rtm->rtm_type == RTN_UNICAST) {
+			if ((table = fib_empty_table()) == NULL)
+				return -ENOBUFS;
+			table_id = table->tb_id;
+		}
+	}
+
+	new_r = kmalloc(sizeof(*new_r), GFP_KERNEL);
+	if (!new_r)
+		return -ENOMEM;
+	memset(new_r, 0, sizeof(*new_r));
+	if (rta[RTA_SRC-1])
+		memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4);
+	if (rta[RTA_DST-1])
+		memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4);
+	if (rta[RTA_GATEWAY-1])
+		memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4);
+	new_r->r_src_len = rtm->rtm_src_len;
+	new_r->r_dst_len = rtm->rtm_dst_len;
+	new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len);
+	new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len);
+	new_r->r_tos = rtm->rtm_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	if (rta[RTA_PROTOINFO-1])
+		memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4);
+#endif
+	new_r->r_action = rtm->rtm_type;
+	new_r->r_flags = rtm->rtm_flags;
+	if (rta[RTA_PRIORITY-1])
+		memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4);
+	new_r->r_table = table_id;
+	if (rta[RTA_IIF-1]) {
+		struct net_device *dev;
+		rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ);
+		new_r->r_ifindex = -1;
+		dev = __dev_get_by_name(new_r->r_ifname);
+		if (dev)
+			new_r->r_ifindex = dev->ifindex;
+	}
+#ifdef CONFIG_NET_CLS_ROUTE
+	if (rta[RTA_FLOW-1])
+		memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4);
+#endif
+
+	rp = &fib_rules;
+	if (!new_r->r_preference) {
+		r = fib_rules;
+		if (r && (r = r->r_next) != NULL) {
+			rp = &fib_rules->r_next;
+			if (r->r_preference)
+				new_r->r_preference = r->r_preference - 1;
+		}
+	}
+
+	while ( (r = *rp) != NULL ) {
+		if (r->r_preference > new_r->r_preference)
+			break;
+		rp = &r->r_next;
+	}
+
+	new_r->r_next = r;
+	atomic_inc(&new_r->r_clntref);
+	write_lock_bh(&fib_rules_lock);
+	*rp = new_r;
+	write_unlock_bh(&fib_rules_lock);
+	return 0;
+}
+
+#ifdef CONFIG_NET_CLS_ROUTE
+u32 fib_rules_tclass(struct fib_result *res)
+{
+	if (res->r)
+		return res->r->r_tclassid;
+	return 0;
+}
+#endif
+
+
+static void fib_rules_detach(struct net_device *dev)
+{
+	struct fib_rule *r;
+
+	for (r=fib_rules; r; r=r->r_next) {
+		if (r->r_ifindex == dev->ifindex) {
+			write_lock_bh(&fib_rules_lock);
+			r->r_ifindex = -1;
+			write_unlock_bh(&fib_rules_lock);
+		}
+	}
+}
+
+static void fib_rules_attach(struct net_device *dev)
+{
+	struct fib_rule *r;
+
+	for (r=fib_rules; r; r=r->r_next) {
+		if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) {
+			write_lock_bh(&fib_rules_lock);
+			r->r_ifindex = dev->ifindex;
+			write_unlock_bh(&fib_rules_lock);
+		}
+	}
+}
+
+int fib_lookup(const struct flowi *flp, struct fib_result *res)
+{
+	int err;
+	struct fib_rule *r, *policy;
+	struct fib_table *tb;
+
+	u32 daddr = flp->fl4_dst;
+	u32 saddr = flp->fl4_src;
+
+FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ",
+	NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src));
+	read_lock(&fib_rules_lock);
+	for (r = fib_rules; r; r=r->r_next) {
+		if (((saddr^r->r_src) & r->r_srcmask) ||
+		    ((daddr^r->r_dst) & r->r_dstmask) ||
+		    (r->r_tos && r->r_tos != flp->fl4_tos) ||
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		    (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) ||
+#endif
+		    (r->r_ifindex && r->r_ifindex != flp->iif))
+			continue;
+
+FRprintk("tb %d r %d ", r->r_table, r->r_action);
+		switch (r->r_action) {
+		case RTN_UNICAST:
+			policy = r;
+			break;
+		case RTN_UNREACHABLE:
+			read_unlock(&fib_rules_lock);
+			return -ENETUNREACH;
+		default:
+		case RTN_BLACKHOLE:
+			read_unlock(&fib_rules_lock);
+			return -EINVAL;
+		case RTN_PROHIBIT:
+			read_unlock(&fib_rules_lock);
+			return -EACCES;
+		}
+
+		if ((tb = fib_get_table(r->r_table)) == NULL)
+			continue;
+		err = tb->tb_lookup(tb, flp, res);
+		if (err == 0) {
+			res->r = policy;
+			if (policy)
+				atomic_inc(&policy->r_clntref);
+			read_unlock(&fib_rules_lock);
+			return 0;
+		}
+		if (err < 0 && err != -EAGAIN) {
+			read_unlock(&fib_rules_lock);
+			return err;
+		}
+	}
+FRprintk("FAILURE\n");
+	read_unlock(&fib_rules_lock);
+	return -ENETUNREACH;
+}
+
+void fib_select_default(const struct flowi *flp, struct fib_result *res)
+{
+	if (res->r && res->r->r_action == RTN_UNICAST &&
+	    FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) {
+		struct fib_table *tb;
+		if ((tb = fib_get_table(res->r->r_table)) != NULL)
+			tb->tb_select_default(tb, flp, res);
+	}
+}
+
+static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (event == NETDEV_UNREGISTER)
+		fib_rules_detach(dev);
+	else if (event == NETDEV_REGISTER)
+		fib_rules_attach(dev);
+	return NOTIFY_DONE;
+}
+
+
+static struct notifier_block fib_rules_notifier = {
+	.notifier_call =fib_rules_event,
+};
+
+static __inline__ int inet_fill_rule(struct sk_buff *skb,
+				     struct fib_rule *r,
+				     struct netlink_callback *cb)
+{
+	struct rtmsg *rtm;
+	struct nlmsghdr  *nlh;
+	unsigned char	 *b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm));
+	rtm = NLMSG_DATA(nlh);
+	rtm->rtm_family = AF_INET;
+	rtm->rtm_dst_len = r->r_dst_len;
+	rtm->rtm_src_len = r->r_src_len;
+	rtm->rtm_tos = r->r_tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	if (r->r_fwmark)
+		RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark);
+#endif
+	rtm->rtm_table = r->r_table;
+	rtm->rtm_protocol = 0;
+	rtm->rtm_scope = 0;
+	rtm->rtm_type = r->r_action;
+	rtm->rtm_flags = r->r_flags;
+
+	if (r->r_dst_len)
+		RTA_PUT(skb, RTA_DST, 4, &r->r_dst);
+	if (r->r_src_len)
+		RTA_PUT(skb, RTA_SRC, 4, &r->r_src);
+	if (r->r_ifname[0])
+		RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname);
+	if (r->r_preference)
+		RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference);
+	if (r->r_srcmap)
+		RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap);
+#ifdef CONFIG_NET_CLS_ROUTE
+	if (r->r_tclassid)
+		RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid);
+#endif
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int idx;
+	int s_idx = cb->args[0];
+	struct fib_rule *r;
+
+	read_lock(&fib_rules_lock);
+	for (r=fib_rules, idx=0; r; r = r->r_next, idx++) {
+		if (idx < s_idx)
+			continue;
+		if (inet_fill_rule(skb, r, cb) < 0)
+			break;
+	}
+	read_unlock(&fib_rules_lock);
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+void __init fib_rules_init(void)
+{
+	register_netdevice_notifier(&fib_rules_notifier);
+}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
new file mode 100644
index 000000000000..029362d66135
--- /dev/null
+++ b/net/ipv4/fib_semantics.c
@@ -0,0 +1,1332 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		IPv4 Forwarding Information Base: semantics.
+ *
+ * Version:	$Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/init.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/ip_mp_alg.h>
+
+#include "fib_lookup.h"
+
+#define FSprintk(a...)
+
+static DEFINE_RWLOCK(fib_info_lock);
+static struct hlist_head *fib_info_hash;
+static struct hlist_head *fib_info_laddrhash;
+static unsigned int fib_hash_size;
+static unsigned int fib_info_cnt;
+
+#define DEVINDEX_HASHBITS 8
+#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
+static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static DEFINE_SPINLOCK(fib_multipath_lock);
+
+#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
+for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
+for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+/* Hope, that gcc will optimize it to get rid of dummy loop */
+
+#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \
+for (nhsel=0; nhsel < 1; nhsel++)
+
+#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \
+for (nhsel=0; nhsel < 1; nhsel++)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define endfor_nexthops(fi) }
+
+
+static struct 
+{
+	int	error;
+	u8	scope;
+} fib_props[RTA_MAX + 1] = {
+        {
+		.error	= 0,
+		.scope	= RT_SCOPE_NOWHERE,
+	},	/* RTN_UNSPEC */
+	{
+		.error	= 0,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},	/* RTN_UNICAST */
+	{
+		.error	= 0,
+		.scope	= RT_SCOPE_HOST,
+	},	/* RTN_LOCAL */
+	{
+		.error	= 0,
+		.scope	= RT_SCOPE_LINK,
+	},	/* RTN_BROADCAST */
+	{
+		.error	= 0,
+		.scope	= RT_SCOPE_LINK,
+	},	/* RTN_ANYCAST */
+	{
+		.error	= 0,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},	/* RTN_MULTICAST */
+	{
+		.error	= -EINVAL,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},	/* RTN_BLACKHOLE */
+	{
+		.error	= -EHOSTUNREACH,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},	/* RTN_UNREACHABLE */
+	{
+		.error	= -EACCES,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},	/* RTN_PROHIBIT */
+	{
+		.error	= -EAGAIN,
+		.scope	= RT_SCOPE_UNIVERSE,
+	},	/* RTN_THROW */
+	{
+		.error	= -EINVAL,
+		.scope	= RT_SCOPE_NOWHERE,
+	},	/* RTN_NAT */
+	{
+		.error	= -EINVAL,
+		.scope	= RT_SCOPE_NOWHERE,
+	},	/* RTN_XRESOLVE */
+};
+
+
+/* Release a nexthop info record */
+
+void free_fib_info(struct fib_info *fi)
+{
+	if (fi->fib_dead == 0) {
+		printk("Freeing alive fib_info %p\n", fi);
+		return;
+	}
+	change_nexthops(fi) {
+		if (nh->nh_dev)
+			dev_put(nh->nh_dev);
+		nh->nh_dev = NULL;
+	} endfor_nexthops(fi);
+	fib_info_cnt--;
+	kfree(fi);
+}
+
+void fib_release_info(struct fib_info *fi)
+{
+	write_lock(&fib_info_lock);
+	if (fi && --fi->fib_treeref == 0) {
+		hlist_del(&fi->fib_hash);
+		if (fi->fib_prefsrc)
+			hlist_del(&fi->fib_lhash);
+		change_nexthops(fi) {
+			if (!nh->nh_dev)
+				continue;
+			hlist_del(&nh->nh_hash);
+		} endfor_nexthops(fi)
+		fi->fib_dead = 1;
+		fib_info_put(fi);
+	}
+	write_unlock(&fib_info_lock);
+}
+
+static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+{
+	const struct fib_nh *onh = ofi->fib_nh;
+
+	for_nexthops(fi) {
+		if (nh->nh_oif != onh->nh_oif ||
+		    nh->nh_gw  != onh->nh_gw ||
+		    nh->nh_scope != onh->nh_scope ||
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		    nh->nh_weight != onh->nh_weight ||
+#endif
+#ifdef CONFIG_NET_CLS_ROUTE
+		    nh->nh_tclassid != onh->nh_tclassid ||
+#endif
+		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+			return -1;
+		onh++;
+	} endfor_nexthops(fi);
+	return 0;
+}
+
+static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
+{
+	unsigned int mask = (fib_hash_size - 1);
+	unsigned int val = fi->fib_nhs;
+
+	val ^= fi->fib_protocol;
+	val ^= fi->fib_prefsrc;
+	val ^= fi->fib_priority;
+
+	return (val ^ (val >> 7) ^ (val >> 12)) & mask;
+}
+
+static struct fib_info *fib_find_info(const struct fib_info *nfi)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct fib_info *fi;
+	unsigned int hash;
+
+	hash = fib_info_hashfn(nfi);
+	head = &fib_info_hash[hash];
+
+	hlist_for_each_entry(fi, node, head, fib_hash) {
+		if (fi->fib_nhs != nfi->fib_nhs)
+			continue;
+		if (nfi->fib_protocol == fi->fib_protocol &&
+		    nfi->fib_prefsrc == fi->fib_prefsrc &&
+		    nfi->fib_priority == fi->fib_priority &&
+		    memcmp(nfi->fib_metrics, fi->fib_metrics,
+			   sizeof(fi->fib_metrics)) == 0 &&
+		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
+			return fi;
+	}
+
+	return NULL;
+}
+
+static inline unsigned int fib_devindex_hashfn(unsigned int val)
+{
+	unsigned int mask = DEVINDEX_HASHSIZE - 1;
+
+	return (val ^
+		(val >> DEVINDEX_HASHBITS) ^
+		(val >> (DEVINDEX_HASHBITS * 2))) & mask;
+}
+
+/* Check, that the gateway is already configured.
+   Used only by redirect accept routine.
+ */
+
+int ip_fib_check_default(u32 gw, struct net_device *dev)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct fib_nh *nh;
+	unsigned int hash;
+
+	read_lock(&fib_info_lock);
+
+	hash = fib_devindex_hashfn(dev->ifindex);
+	head = &fib_info_devhash[hash];
+	hlist_for_each_entry(nh, node, head, nh_hash) {
+		if (nh->nh_dev == dev &&
+		    nh->nh_gw == gw &&
+		    !(nh->nh_flags&RTNH_F_DEAD)) {
+			read_unlock(&fib_info_lock);
+			return 0;
+		}
+	}
+
+	read_unlock(&fib_info_lock);
+
+	return -1;
+}
+
+void rtmsg_fib(int event, u32 key, struct fib_alias *fa,
+	       int z, int tb_id,
+	       struct nlmsghdr *n, struct netlink_skb_parms *req)
+{
+	struct sk_buff *skb;
+	u32 pid = req ? req->pid : 0;
+	int size = NLMSG_SPACE(sizeof(struct rtmsg)+256);
+
+	skb = alloc_skb(size, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id,
+			  fa->fa_type, fa->fa_scope, &key, z,
+			  fa->fa_tos,
+			  fa->fa_info) < 0) {
+		kfree_skb(skb);
+		return;
+	}
+	NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE;
+	if (n->nlmsg_flags&NLM_F_ECHO)
+		atomic_inc(&skb->users);
+	netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL);
+	if (n->nlmsg_flags&NLM_F_ECHO)
+		netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+}
+
+/* Return the first fib alias matching TOS with
+ * priority less than or equal to PRIO.
+ */
+struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
+{
+	if (fah) {
+		struct fib_alias *fa;
+		list_for_each_entry(fa, fah, fa_list) {
+			if (fa->fa_tos > tos)
+				continue;
+			if (fa->fa_info->fib_priority >= prio ||
+			    fa->fa_tos < tos)
+				return fa;
+		}
+	}
+	return NULL;
+}
+
+int fib_detect_death(struct fib_info *fi, int order,
+		     struct fib_info **last_resort, int *last_idx, int *dflt)
+{
+	struct neighbour *n;
+	int state = NUD_NONE;
+
+	n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
+	if (n) {
+		state = n->nud_state;
+		neigh_release(n);
+	}
+	if (state==NUD_REACHABLE)
+		return 0;
+	if ((state&NUD_VALID) && order != *dflt)
+		return 0;
+	if ((state&NUD_VALID) ||
+	    (*last_idx<0 && order > *dflt)) {
+		*last_resort = fi;
+		*last_idx = order;
+	}
+	return 1;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type)
+{
+	while (RTA_OK(attr,attrlen)) {
+		if (attr->rta_type == type)
+			return *(u32*)RTA_DATA(attr);
+		attr = RTA_NEXT(attr, attrlen);
+	}
+	return 0;
+}
+
+static int
+fib_count_nexthops(struct rtattr *rta)
+{
+	int nhs = 0;
+	struct rtnexthop *nhp = RTA_DATA(rta);
+	int nhlen = RTA_PAYLOAD(rta);
+
+	while (nhlen >= (int)sizeof(struct rtnexthop)) {
+		if ((nhlen -= nhp->rtnh_len) < 0)
+			return 0;
+		nhs++;
+		nhp = RTNH_NEXT(nhp);
+	};
+	return nhs;
+}
+
+static int
+fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r)
+{
+	struct rtnexthop *nhp = RTA_DATA(rta);
+	int nhlen = RTA_PAYLOAD(rta);
+
+	change_nexthops(fi) {
+		int attrlen = nhlen - sizeof(struct rtnexthop);
+		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+			return -EINVAL;
+		nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags;
+		nh->nh_oif = nhp->rtnh_ifindex;
+		nh->nh_weight = nhp->rtnh_hops + 1;
+		if (attrlen) {
+			nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+#ifdef CONFIG_NET_CLS_ROUTE
+			nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+#endif
+		}
+		nhp = RTNH_NEXT(nhp);
+	} endfor_nexthops(fi);
+	return 0;
+}
+
+#endif
+
+int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta,
+		 struct fib_info *fi)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	struct rtnexthop *nhp;
+	int nhlen;
+#endif
+
+	if (rta->rta_priority &&
+	    *rta->rta_priority != fi->fib_priority)
+		return 1;
+
+	if (rta->rta_oif || rta->rta_gw) {
+		if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) &&
+		    (!rta->rta_gw  || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0))
+			return 0;
+		return 1;
+	}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (rta->rta_mp == NULL)
+		return 0;
+	nhp = RTA_DATA(rta->rta_mp);
+	nhlen = RTA_PAYLOAD(rta->rta_mp);
+	
+	for_nexthops(fi) {
+		int attrlen = nhlen - sizeof(struct rtnexthop);
+		u32 gw;
+
+		if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0)
+			return -EINVAL;
+		if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif)
+			return 1;
+		if (attrlen) {
+			gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY);
+			if (gw && gw != nh->nh_gw)
+				return 1;
+#ifdef CONFIG_NET_CLS_ROUTE
+			gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW);
+			if (gw && gw != nh->nh_tclassid)
+				return 1;
+#endif
+		}
+		nhp = RTNH_NEXT(nhp);
+	} endfor_nexthops(fi);
+#endif
+	return 0;
+}
+
+
+/*
+   Picture
+   -------
+
+   Semantics of nexthop is very messy by historical reasons.
+   We have to take into account, that:
+   a) gateway can be actually local interface address,
+      so that gatewayed route is direct.
+   b) gateway must be on-link address, possibly
+      described not by an ifaddr, but also by a direct route.
+   c) If both gateway and interface are specified, they should not
+      contradict.
+   d) If we use tunnel routes, gateway could be not on-link.
+
+   Attempt to reconcile all of these (alas, self-contradictory) conditions
+   results in pretty ugly and hairy code with obscure logic.
+
+   I chose to generalized it instead, so that the size
+   of code does not increase practically, but it becomes
+   much more general.
+   Every prefix is assigned a "scope" value: "host" is local address,
+   "link" is direct route,
+   [ ... "site" ... "interior" ... ]
+   and "universe" is true gateway route with global meaning.
+
+   Every prefix refers to a set of "nexthop"s (gw, oif),
+   where gw must have narrower scope. This recursion stops
+   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+   which means that gw is forced to be on link.
+
+   Code is still hairy, but now it is apparently logically
+   consistent and very flexible. F.e. as by-product it allows
+   to co-exists in peace independent exterior and interior
+   routing processes.
+
+   Normally it looks as following.
+
+   {universe prefix}  -> (gw, oif) [scope link]
+                          |
+			  |-> {link prefix} -> (gw, oif) [scope local]
+			                        |
+						|-> {local prefix} (terminal node)
+ */
+
+static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh)
+{
+	int err;
+
+	if (nh->nh_gw) {
+		struct fib_result res;
+
+#ifdef CONFIG_IP_ROUTE_PERVASIVE
+		if (nh->nh_flags&RTNH_F_PERVASIVE)
+			return 0;
+#endif
+		if (nh->nh_flags&RTNH_F_ONLINK) {
+			struct net_device *dev;
+
+			if (r->rtm_scope >= RT_SCOPE_LINK)
+				return -EINVAL;
+			if (inet_addr_type(nh->nh_gw) != RTN_UNICAST)
+				return -EINVAL;
+			if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL)
+				return -ENODEV;
+			if (!(dev->flags&IFF_UP))
+				return -ENETDOWN;
+			nh->nh_dev = dev;
+			dev_hold(dev);
+			nh->nh_scope = RT_SCOPE_LINK;
+			return 0;
+		}
+		{
+			struct flowi fl = { .nl_u = { .ip4_u =
+						      { .daddr = nh->nh_gw,
+							.scope = r->rtm_scope + 1 } },
+					    .oif = nh->nh_oif };
+
+			/* It is not necessary, but requires a bit of thinking */
+			if (fl.fl4_scope < RT_SCOPE_LINK)
+				fl.fl4_scope = RT_SCOPE_LINK;
+			if ((err = fib_lookup(&fl, &res)) != 0)
+				return err;
+		}
+		err = -EINVAL;
+		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
+			goto out;
+		nh->nh_scope = res.scope;
+		nh->nh_oif = FIB_RES_OIF(res);
+		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+			goto out;
+		dev_hold(nh->nh_dev);
+		err = -ENETDOWN;
+		if (!(nh->nh_dev->flags & IFF_UP))
+			goto out;
+		err = 0;
+out:
+		fib_res_put(&res);
+		return err;
+	} else {
+		struct in_device *in_dev;
+
+		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+			return -EINVAL;
+
+		in_dev = inetdev_by_index(nh->nh_oif);
+		if (in_dev == NULL)
+			return -ENODEV;
+		if (!(in_dev->dev->flags&IFF_UP)) {
+			in_dev_put(in_dev);
+			return -ENETDOWN;
+		}
+		nh->nh_dev = in_dev->dev;
+		dev_hold(nh->nh_dev);
+		nh->nh_scope = RT_SCOPE_HOST;
+		in_dev_put(in_dev);
+	}
+	return 0;
+}
+
+static inline unsigned int fib_laddr_hashfn(u32 val)
+{
+	unsigned int mask = (fib_hash_size - 1);
+
+	return (val ^ (val >> 7) ^ (val >> 14)) & mask;
+}
+
+static struct hlist_head *fib_hash_alloc(int bytes)
+{
+	if (bytes <= PAGE_SIZE)
+		return kmalloc(bytes, GFP_KERNEL);
+	else
+		return (struct hlist_head *)
+			__get_free_pages(GFP_KERNEL, get_order(bytes));
+}
+
+static void fib_hash_free(struct hlist_head *hash, int bytes)
+{
+	if (!hash)
+		return;
+
+	if (bytes <= PAGE_SIZE)
+		kfree(hash);
+	else
+		free_pages((unsigned long) hash, get_order(bytes));
+}
+
+static void fib_hash_move(struct hlist_head *new_info_hash,
+			  struct hlist_head *new_laddrhash,
+			  unsigned int new_size)
+{
+	unsigned int old_size = fib_hash_size;
+	unsigned int i;
+
+	write_lock(&fib_info_lock);
+	fib_hash_size = new_size;
+
+	for (i = 0; i < old_size; i++) {
+		struct hlist_head *head = &fib_info_hash[i];
+		struct hlist_node *node, *n;
+		struct fib_info *fi;
+
+		hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
+			struct hlist_head *dest;
+			unsigned int new_hash;
+
+			hlist_del(&fi->fib_hash);
+
+			new_hash = fib_info_hashfn(fi);
+			dest = &new_info_hash[new_hash];
+			hlist_add_head(&fi->fib_hash, dest);
+		}
+	}
+	fib_info_hash = new_info_hash;
+
+	for (i = 0; i < old_size; i++) {
+		struct hlist_head *lhead = &fib_info_laddrhash[i];
+		struct hlist_node *node, *n;
+		struct fib_info *fi;
+
+		hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
+			struct hlist_head *ldest;
+			unsigned int new_hash;
+
+			hlist_del(&fi->fib_lhash);
+
+			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
+			ldest = &new_laddrhash[new_hash];
+			hlist_add_head(&fi->fib_lhash, ldest);
+		}
+	}
+	fib_info_laddrhash = new_laddrhash;
+
+	write_unlock(&fib_info_lock);
+}
+
+struct fib_info *
+fib_create_info(const struct rtmsg *r, struct kern_rta *rta,
+		const struct nlmsghdr *nlh, int *errp)
+{
+	int err;
+	struct fib_info *fi = NULL;
+	struct fib_info *ofi;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	int nhs = 1;
+#else
+	const int nhs = 1;
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	u32 mp_alg = IP_MP_ALG_NONE;
+#endif
+
+	/* Fast check to catch the most weird cases */
+	if (fib_props[r->rtm_type].scope > r->rtm_scope)
+		goto err_inval;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (rta->rta_mp) {
+		nhs = fib_count_nexthops(rta->rta_mp);
+		if (nhs == 0)
+			goto err_inval;
+	}
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if (rta->rta_mp_alg) {
+		mp_alg = *rta->rta_mp_alg;
+
+		if (mp_alg < IP_MP_ALG_NONE ||
+		    mp_alg > IP_MP_ALG_MAX)
+			goto err_inval;
+	}
+#endif
+
+	err = -ENOBUFS;
+	if (fib_info_cnt >= fib_hash_size) {
+		unsigned int new_size = fib_hash_size << 1;
+		struct hlist_head *new_info_hash;
+		struct hlist_head *new_laddrhash;
+		unsigned int bytes;
+
+		if (!new_size)
+			new_size = 1;
+		bytes = new_size * sizeof(struct hlist_head *);
+		new_info_hash = fib_hash_alloc(bytes);
+		new_laddrhash = fib_hash_alloc(bytes);
+		if (!new_info_hash || !new_laddrhash) {
+			fib_hash_free(new_info_hash, bytes);
+			fib_hash_free(new_laddrhash, bytes);
+		} else {
+			memset(new_info_hash, 0, bytes);
+			memset(new_laddrhash, 0, bytes);
+
+			fib_hash_move(new_info_hash, new_laddrhash, new_size);
+		}
+
+		if (!fib_hash_size)
+			goto failure;
+	}
+
+	fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
+	if (fi == NULL)
+		goto failure;
+	fib_info_cnt++;
+	memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh));
+
+	fi->fib_protocol = r->rtm_protocol;
+
+	fi->fib_nhs = nhs;
+	change_nexthops(fi) {
+		nh->nh_parent = fi;
+	} endfor_nexthops(fi)
+
+	fi->fib_flags = r->rtm_flags;
+	if (rta->rta_priority)
+		fi->fib_priority = *rta->rta_priority;
+	if (rta->rta_mx) {
+		int attrlen = RTA_PAYLOAD(rta->rta_mx);
+		struct rtattr *attr = RTA_DATA(rta->rta_mx);
+
+		while (RTA_OK(attr, attrlen)) {
+			unsigned flavor = attr->rta_type;
+			if (flavor) {
+				if (flavor > RTAX_MAX)
+					goto err_inval;
+				fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr);
+			}
+			attr = RTA_NEXT(attr, attrlen);
+		}
+	}
+	if (rta->rta_prefsrc)
+		memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4);
+
+	if (rta->rta_mp) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0)
+			goto failure;
+		if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif)
+			goto err_inval;
+		if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4))
+			goto err_inval;
+#ifdef CONFIG_NET_CLS_ROUTE
+		if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4))
+			goto err_inval;
+#endif
+#else
+		goto err_inval;
+#endif
+	} else {
+		struct fib_nh *nh = fi->fib_nh;
+		if (rta->rta_oif)
+			nh->nh_oif = *rta->rta_oif;
+		if (rta->rta_gw)
+			memcpy(&nh->nh_gw, rta->rta_gw, 4);
+#ifdef CONFIG_NET_CLS_ROUTE
+		if (rta->rta_flow)
+			memcpy(&nh->nh_tclassid, rta->rta_flow, 4);
+#endif
+		nh->nh_flags = r->rtm_flags;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+		nh->nh_weight = 1;
+#endif
+	}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	fi->fib_mp_alg = mp_alg;
+#endif
+
+	if (fib_props[r->rtm_type].error) {
+		if (rta->rta_gw || rta->rta_oif || rta->rta_mp)
+			goto err_inval;
+		goto link_it;
+	}
+
+	if (r->rtm_scope > RT_SCOPE_HOST)
+		goto err_inval;
+
+	if (r->rtm_scope == RT_SCOPE_HOST) {
+		struct fib_nh *nh = fi->fib_nh;
+
+		/* Local address is added. */
+		if (nhs != 1 || nh->nh_gw)
+			goto err_inval;
+		nh->nh_scope = RT_SCOPE_NOWHERE;
+		nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif);
+		err = -ENODEV;
+		if (nh->nh_dev == NULL)
+			goto failure;
+	} else {
+		change_nexthops(fi) {
+			if ((err = fib_check_nh(r, fi, nh)) != 0)
+				goto failure;
+		} endfor_nexthops(fi)
+	}
+
+	if (fi->fib_prefsrc) {
+		if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL ||
+		    memcmp(&fi->fib_prefsrc, rta->rta_dst, 4))
+			if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL)
+				goto err_inval;
+	}
+
+link_it:
+	if ((ofi = fib_find_info(fi)) != NULL) {
+		fi->fib_dead = 1;
+		free_fib_info(fi);
+		ofi->fib_treeref++;
+		return ofi;
+	}
+
+	fi->fib_treeref++;
+	atomic_inc(&fi->fib_clntref);
+	write_lock(&fib_info_lock);
+	hlist_add_head(&fi->fib_hash,
+		       &fib_info_hash[fib_info_hashfn(fi)]);
+	if (fi->fib_prefsrc) {
+		struct hlist_head *head;
+
+		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
+		hlist_add_head(&fi->fib_lhash, head);
+	}
+	change_nexthops(fi) {
+		struct hlist_head *head;
+		unsigned int hash;
+
+		if (!nh->nh_dev)
+			continue;
+		hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
+		head = &fib_info_devhash[hash];
+		hlist_add_head(&nh->nh_hash, head);
+	} endfor_nexthops(fi)
+	write_unlock(&fib_info_lock);
+	return fi;
+
+err_inval:
+	err = -EINVAL;
+
+failure:
+        *errp = err;
+        if (fi) {
+		fi->fib_dead = 1;
+		free_fib_info(fi);
+	}
+	return NULL;
+}
+
+int fib_semantic_match(struct list_head *head, const struct flowi *flp,
+		       struct fib_result *res, __u32 zone, __u32 mask, 
+			int prefixlen)
+{
+	struct fib_alias *fa;
+	int nh_sel = 0;
+
+	list_for_each_entry(fa, head, fa_list) {
+		int err;
+
+		if (fa->fa_tos &&
+		    fa->fa_tos != flp->fl4_tos)
+			continue;
+
+		if (fa->fa_scope < flp->fl4_scope)
+			continue;
+
+		fa->fa_state |= FA_S_ACCESSED;
+
+		err = fib_props[fa->fa_type].error;
+		if (err == 0) {
+			struct fib_info *fi = fa->fa_info;
+
+			if (fi->fib_flags & RTNH_F_DEAD)
+				continue;
+
+			switch (fa->fa_type) {
+			case RTN_UNICAST:
+			case RTN_LOCAL:
+			case RTN_BROADCAST:
+			case RTN_ANYCAST:
+			case RTN_MULTICAST:
+				for_nexthops(fi) {
+					if (nh->nh_flags&RTNH_F_DEAD)
+						continue;
+					if (!flp->oif || flp->oif == nh->nh_oif)
+						break;
+				}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+				if (nhsel < fi->fib_nhs) {
+					nh_sel = nhsel;
+					goto out_fill_res;
+				}
+#else
+				if (nhsel < 1) {
+					goto out_fill_res;
+				}
+#endif
+				endfor_nexthops(fi);
+				continue;
+
+			default:
+				printk(KERN_DEBUG "impossible 102\n");
+				return -EINVAL;
+			};
+		}
+		return err;
+	}
+	return 1;
+
+out_fill_res:
+	res->prefixlen = prefixlen;
+	res->nh_sel = nh_sel;
+	res->type = fa->fa_type;
+	res->scope = fa->fa_scope;
+	res->fi = fa->fa_info;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	res->netmask = mask;
+	res->network = zone &
+		(0xFFFFFFFF >> (32 - prefixlen));
+#endif
+	atomic_inc(&res->fi->fib_clntref);
+	return 0;
+}
+
+/* Find appropriate source address to this destination */
+
+u32 __fib_res_prefsrc(struct fib_result *res)
+{
+	return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
+}
+
+int
+fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+	      u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos,
+	      struct fib_info *fi)
+{
+	struct rtmsg *rtm;
+	struct nlmsghdr  *nlh;
+	unsigned char	 *b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm));
+	rtm = NLMSG_DATA(nlh);
+	rtm->rtm_family = AF_INET;
+	rtm->rtm_dst_len = dst_len;
+	rtm->rtm_src_len = 0;
+	rtm->rtm_tos = tos;
+	rtm->rtm_table = tb_id;
+	rtm->rtm_type = type;
+	rtm->rtm_flags = fi->fib_flags;
+	rtm->rtm_scope = scope;
+	if (rtm->rtm_dst_len)
+		RTA_PUT(skb, RTA_DST, 4, dst);
+	rtm->rtm_protocol = fi->fib_protocol;
+	if (fi->fib_priority)
+		RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority);
+#ifdef CONFIG_NET_CLS_ROUTE
+	if (fi->fib_nh[0].nh_tclassid)
+		RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid);
+#endif
+	if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
+		goto rtattr_failure;
+	if (fi->fib_prefsrc)
+		RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc);
+	if (fi->fib_nhs == 1) {
+		if (fi->fib_nh->nh_gw)
+			RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw);
+		if (fi->fib_nh->nh_oif)
+			RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif);
+	}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (fi->fib_nhs > 1) {
+		struct rtnexthop *nhp;
+		struct rtattr *mp_head;
+		if (skb_tailroom(skb) <= RTA_SPACE(0))
+			goto rtattr_failure;
+		mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0));
+
+		for_nexthops(fi) {
+			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+				goto rtattr_failure;
+			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+			nhp->rtnh_flags = nh->nh_flags & 0xFF;
+			nhp->rtnh_hops = nh->nh_weight-1;
+			nhp->rtnh_ifindex = nh->nh_oif;
+			if (nh->nh_gw)
+				RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw);
+			nhp->rtnh_len = skb->tail - (unsigned char*)nhp;
+		} endfor_nexthops(fi);
+		mp_head->rta_type = RTA_MULTIPATH;
+		mp_head->rta_len = skb->tail - (u8*)mp_head;
+	}
+#endif
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+#ifndef CONFIG_IP_NOSIOCRT
+
+int
+fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
+		    struct kern_rta *rta, struct rtentry *r)
+{
+	int    plen;
+	u32    *ptr;
+
+	memset(rtm, 0, sizeof(*rtm));
+	memset(rta, 0, sizeof(*rta));
+
+	if (r->rt_dst.sa_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	/* Check mask for validity:
+	   a) it must be contiguous.
+	   b) destination must have all host bits clear.
+	   c) if application forgot to set correct family (AF_INET),
+	      reject request unless it is absolutely clear i.e.
+	      both family and mask are zero.
+	 */
+	plen = 32;
+	ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr;
+	if (!(r->rt_flags&RTF_HOST)) {
+		u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr;
+		if (r->rt_genmask.sa_family != AF_INET) {
+			if (mask || r->rt_genmask.sa_family)
+				return -EAFNOSUPPORT;
+		}
+		if (bad_mask(mask, *ptr))
+			return -EINVAL;
+		plen = inet_mask_len(mask);
+	}
+
+	nl->nlmsg_flags = NLM_F_REQUEST;
+	nl->nlmsg_pid = 0;
+	nl->nlmsg_seq = 0;
+	nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm));
+	if (cmd == SIOCDELRT) {
+		nl->nlmsg_type = RTM_DELROUTE;
+		nl->nlmsg_flags = 0;
+	} else {
+		nl->nlmsg_type = RTM_NEWROUTE;
+		nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE;
+		rtm->rtm_protocol = RTPROT_BOOT;
+	}
+
+	rtm->rtm_dst_len = plen;
+	rta->rta_dst = ptr;
+
+	if (r->rt_metric) {
+		*(u32*)&r->rt_pad3 = r->rt_metric - 1;
+		rta->rta_priority = (u32*)&r->rt_pad3;
+	}
+	if (r->rt_flags&RTF_REJECT) {
+		rtm->rtm_scope = RT_SCOPE_HOST;
+		rtm->rtm_type = RTN_UNREACHABLE;
+		return 0;
+	}
+	rtm->rtm_scope = RT_SCOPE_NOWHERE;
+	rtm->rtm_type = RTN_UNICAST;
+
+	if (r->rt_dev) {
+		char *colon;
+		struct net_device *dev;
+		char   devname[IFNAMSIZ];
+
+		if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1))
+			return -EFAULT;
+		devname[IFNAMSIZ-1] = 0;
+		colon = strchr(devname, ':');
+		if (colon)
+			*colon = 0;
+		dev = __dev_get_by_name(devname);
+		if (!dev)
+			return -ENODEV;
+		rta->rta_oif = &dev->ifindex;
+		if (colon) {
+			struct in_ifaddr *ifa;
+			struct in_device *in_dev = __in_dev_get(dev);
+			if (!in_dev)
+				return -ENODEV;
+			*colon = ':';
+			for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next)
+				if (strcmp(ifa->ifa_label, devname) == 0)
+					break;
+			if (ifa == NULL)
+				return -ENODEV;
+			rta->rta_prefsrc = &ifa->ifa_local;
+		}
+	}
+
+	ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr;
+	if (r->rt_gateway.sa_family == AF_INET && *ptr) {
+		rta->rta_gw = ptr;
+		if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST)
+			rtm->rtm_scope = RT_SCOPE_UNIVERSE;
+	}
+
+	if (cmd == SIOCDELRT)
+		return 0;
+
+	if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL)
+		return -EINVAL;
+
+	if (rtm->rtm_scope == RT_SCOPE_NOWHERE)
+		rtm->rtm_scope = RT_SCOPE_LINK;
+
+	if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) {
+		struct rtattr *rec;
+		struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL);
+		if (mx == NULL)
+			return -ENOMEM;
+		rta->rta_mx = mx;
+		mx->rta_type = RTA_METRICS;
+		mx->rta_len  = RTA_LENGTH(0);
+		if (r->rt_flags&RTF_MTU) {
+			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+			rec->rta_type = RTAX_ADVMSS;
+			rec->rta_len = RTA_LENGTH(4);
+			mx->rta_len += RTA_LENGTH(4);
+			*(u32*)RTA_DATA(rec) = r->rt_mtu - 40;
+		}
+		if (r->rt_flags&RTF_WINDOW) {
+			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+			rec->rta_type = RTAX_WINDOW;
+			rec->rta_len = RTA_LENGTH(4);
+			mx->rta_len += RTA_LENGTH(4);
+			*(u32*)RTA_DATA(rec) = r->rt_window;
+		}
+		if (r->rt_flags&RTF_IRTT) {
+			rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len));
+			rec->rta_type = RTAX_RTT;
+			rec->rta_len = RTA_LENGTH(4);
+			mx->rta_len += RTA_LENGTH(4);
+			*(u32*)RTA_DATA(rec) = r->rt_irtt<<3;
+		}
+	}
+	return 0;
+}
+
+#endif
+
+/*
+   Update FIB if:
+   - local address disappeared -> we must delete all the entries
+     referring to it.
+   - device went down -> we must shutdown all nexthops going via it.
+ */
+
+int fib_sync_down(u32 local, struct net_device *dev, int force)
+{
+	int ret = 0;
+	int scope = RT_SCOPE_NOWHERE;
+	
+	if (force)
+		scope = -1;
+
+	if (local && fib_info_laddrhash) {
+		unsigned int hash = fib_laddr_hashfn(local);
+		struct hlist_head *head = &fib_info_laddrhash[hash];
+		struct hlist_node *node;
+		struct fib_info *fi;
+
+		hlist_for_each_entry(fi, node, head, fib_lhash) {
+			if (fi->fib_prefsrc == local) {
+				fi->fib_flags |= RTNH_F_DEAD;
+				ret++;
+			}
+		}
+	}
+
+	if (dev) {
+		struct fib_info *prev_fi = NULL;
+		unsigned int hash = fib_devindex_hashfn(dev->ifindex);
+		struct hlist_head *head = &fib_info_devhash[hash];
+		struct hlist_node *node;
+		struct fib_nh *nh;
+
+		hlist_for_each_entry(nh, node, head, nh_hash) {
+			struct fib_info *fi = nh->nh_parent;
+			int dead;
+
+			BUG_ON(!fi->fib_nhs);
+			if (nh->nh_dev != dev || fi == prev_fi)
+				continue;
+			prev_fi = fi;
+			dead = 0;
+			change_nexthops(fi) {
+				if (nh->nh_flags&RTNH_F_DEAD)
+					dead++;
+				else if (nh->nh_dev == dev &&
+					 nh->nh_scope != scope) {
+					nh->nh_flags |= RTNH_F_DEAD;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+					spin_lock_bh(&fib_multipath_lock);
+					fi->fib_power -= nh->nh_power;
+					nh->nh_power = 0;
+					spin_unlock_bh(&fib_multipath_lock);
+#endif
+					dead++;
+				}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+				if (force > 1 && nh->nh_dev == dev) {
+					dead = fi->fib_nhs;
+					break;
+				}
+#endif
+			} endfor_nexthops(fi)
+			if (dead == fi->fib_nhs) {
+				fi->fib_flags |= RTNH_F_DEAD;
+				ret++;
+			}
+		}
+	}
+
+	return ret;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/*
+   Dead device goes up. We wake up dead nexthops.
+   It takes sense only on multipath routes.
+ */
+
+int fib_sync_up(struct net_device *dev)
+{
+	struct fib_info *prev_fi;
+	unsigned int hash;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct fib_nh *nh;
+	int ret;
+
+	if (!(dev->flags&IFF_UP))
+		return 0;
+
+	prev_fi = NULL;
+	hash = fib_devindex_hashfn(dev->ifindex);
+	head = &fib_info_devhash[hash];
+	ret = 0;
+
+	hlist_for_each_entry(nh, node, head, nh_hash) {
+		struct fib_info *fi = nh->nh_parent;
+		int alive;
+
+		BUG_ON(!fi->fib_nhs);
+		if (nh->nh_dev != dev || fi == prev_fi)
+			continue;
+
+		prev_fi = fi;
+		alive = 0;
+		change_nexthops(fi) {
+			if (!(nh->nh_flags&RTNH_F_DEAD)) {
+				alive++;
+				continue;
+			}
+			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+				continue;
+			if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
+				continue;
+			alive++;
+			spin_lock_bh(&fib_multipath_lock);
+			nh->nh_power = 0;
+			nh->nh_flags &= ~RTNH_F_DEAD;
+			spin_unlock_bh(&fib_multipath_lock);
+		} endfor_nexthops(fi)
+
+		if (alive > 0) {
+			fi->fib_flags &= ~RTNH_F_DEAD;
+			ret++;
+		}
+	}
+
+	return ret;
+}
+
+/*
+   The algorithm is suboptimal, but it provides really
+   fair weighted route distribution.
+ */
+
+void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
+{
+	struct fib_info *fi = res->fi;
+	int w;
+
+	spin_lock_bh(&fib_multipath_lock);
+	if (fi->fib_power <= 0) {
+		int power = 0;
+		change_nexthops(fi) {
+			if (!(nh->nh_flags&RTNH_F_DEAD)) {
+				power += nh->nh_weight;
+				nh->nh_power = nh->nh_weight;
+			}
+		} endfor_nexthops(fi);
+		fi->fib_power = power;
+		if (power <= 0) {
+			spin_unlock_bh(&fib_multipath_lock);
+			/* Race condition: route has just become dead. */
+			res->nh_sel = 0;
+			return;
+		}
+	}
+
+
+	/* w should be random number [0..fi->fib_power-1],
+	   it is pretty bad approximation.
+	 */
+
+	w = jiffies % fi->fib_power;
+
+	change_nexthops(fi) {
+		if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
+			if ((w -= nh->nh_power) <= 0) {
+				nh->nh_power--;
+				fi->fib_power--;
+				res->nh_sel = nhsel;
+				spin_unlock_bh(&fib_multipath_lock);
+				return;
+			}
+		}
+	} endfor_nexthops(fi);
+
+	/* Race condition: route has just become dead. */
+	res->nh_sel = 0;
+	spin_unlock_bh(&fib_multipath_lock);
+}
+#endif
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
new file mode 100644
index 000000000000..85bf0d3e294b
--- /dev/null
+++ b/net/ipv4/icmp.c
@@ -0,0 +1,1143 @@
+/*
+ *	NET3:	Implementation of the ICMP protocol layer.
+ *
+ *		Alan Cox, <alan@redhat.com>
+ *
+ *	Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Some of the function names and the icmp unreach table for this
+ *	module were derived from [icmp.c 1.0.11 06/02/93] by
+ *	Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting.
+ *	Other than that this module is a complete rewrite.
+ *
+ *	Fixes:
+ *	Clemens Fruhwirth	:	introduce global icmp rate limiting
+ *					with icmp type masking ability instead
+ *					of broken per type icmp timeouts.
+ *		Mike Shaver	:	RFC1122 checks.
+ *		Alan Cox	:	Multicast ping reply as self.
+ *		Alan Cox	:	Fix atomicity lockup in ip_build_xmit
+ *					call.
+ *		Alan Cox	:	Added 216,128 byte paths to the MTU
+ *					code.
+ *		Martin Mares	:	RFC1812 checks.
+ *		Martin Mares	:	Can be configured to follow redirects
+ *					if acting as a router _without_ a
+ *					routing protocol (RFC 1812).
+ *		Martin Mares	:	Echo requests may be configured to
+ *					be ignored (RFC 1812).
+ *		Martin Mares	:	Limitation of ICMP error message
+ *					transmit rate (RFC 1812).
+ *		Martin Mares	:	TOS and Precedence set correctly
+ *					(RFC 1812).
+ *		Martin Mares	:	Now copying as much data from the
+ *					original packet as we can without
+ *					exceeding 576 bytes (RFC 1812).
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		Keith Owens	:	RFC1191 correction for 4.2BSD based
+ *					path MTU bug.
+ *		Thomas Quinot	:	ICMP Dest Unreach codes up to 15 are
+ *					valid (RFC 1812).
+ *		Andi Kleen	:	Check all packet lengths properly
+ *					and moved all kfree_skb() up to
+ *					icmp_rcv.
+ *		Andi Kleen	:	Move the rate limit bookkeeping
+ *					into the dest entry and use a token
+ *					bucket filter (thanks to ANK). Make
+ *					the rates sysctl configurable.
+ *		Yu Tianli	:	Fixed two ugly bugs in icmp_send
+ *					- IP option length was accounted wrongly
+ *					- ICMP header length was not accounted
+ *					  at all.
+ *              Tristan Greaves :       Added sysctl option to ignore bogus
+ *              			broadcast responses from broken routers.
+ *
+ * To Fix:
+ *
+ *	- Should use skb_pull() instead of all the manual checking.
+ *	  This would also greatly simply some upper layer error handlers. --AK
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/string.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/protocol.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/init.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+
+/*
+ *	Build xmit assembly blocks
+ */
+
+struct icmp_bxm {
+	struct sk_buff *skb;
+	int offset;
+	int data_len;
+
+	struct {
+		struct icmphdr icmph;
+		__u32	       times[3];
+	} data;
+	int head_len;
+	struct ip_options replyopts;
+	unsigned char  optbuf[40];
+};
+
+/*
+ *	Statistics
+ */
+DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics);
+
+/* An array of errno for error messages from dest unreach. */
+/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
+
+struct icmp_err icmp_err_convert[] = {
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNREACH */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_UNREACH */
+		.fatal = 0,
+	},
+	{
+		.errno = ENOPROTOOPT	/* ICMP_PROT_UNREACH */,
+		.fatal = 1,
+	},
+	{
+		.errno = ECONNREFUSED,	/* ICMP_PORT_UNREACH */
+		.fatal = 1,
+	},
+	{
+		.errno = EMSGSIZE,	/* ICMP_FRAG_NEEDED */
+		.fatal = 0,
+	},
+	{
+		.errno = EOPNOTSUPP,	/* ICMP_SR_FAILED */
+		.fatal = 0,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNKNOWN */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTDOWN,	/* ICMP_HOST_UNKNOWN */
+		.fatal = 1,
+	},
+	{
+		.errno = ENONET,	/* ICMP_HOST_ISOLATED */
+		.fatal = 1,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_ANO	*/
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_ANO */
+		.fatal = 1,
+	},
+	{
+		.errno = ENETUNREACH,	/* ICMP_NET_UNR_TOS */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_HOST_UNR_TOS */
+		.fatal = 0,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PKT_FILTERED */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PREC_VIOLATION */
+		.fatal = 1,
+	},
+	{
+		.errno = EHOSTUNREACH,	/* ICMP_PREC_CUTOFF */
+		.fatal = 1,
+	},
+};
+
+/* Control parameters for ECHO replies. */
+int sysctl_icmp_echo_ignore_all;
+int sysctl_icmp_echo_ignore_broadcasts;
+
+/* Control parameter - ignore bogus broadcast responses? */
+int sysctl_icmp_ignore_bogus_error_responses;
+
+/*
+ * 	Configurable global rate limit.
+ *
+ *	ratelimit defines tokens/packet consumed for dst->rate_token bucket
+ *	ratemask defines which icmp types are ratelimited by setting
+ * 	it's bit position.
+ *
+ *	default:
+ *	dest unreachable (3), source quench (4),
+ *	time exceeded (11), parameter problem (12)
+ */
+
+int sysctl_icmp_ratelimit = 1 * HZ;
+int sysctl_icmp_ratemask = 0x1818;
+
+/*
+ *	ICMP control array. This specifies what to do with each ICMP.
+ */
+
+struct icmp_control {
+	int output_entry;	/* Field for increment on output */
+	int input_entry;	/* Field for increment on input */
+	void (*handler)(struct sk_buff *skb);
+	short   error;		/* This ICMP is classed as an error message */
+};
+
+static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
+
+/*
+ *	The ICMP socket(s). This is the most convenient way to flow control
+ *	our ICMP output as well as maintain a clean interface throughout
+ *	all layers. All Socketless IP sends will soon be gone.
+ *
+ *	On SMP we have one ICMP socket per-cpu.
+ */
+static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL;
+#define icmp_socket	__get_cpu_var(__icmp_socket)
+
+static __inline__ int icmp_xmit_lock(void)
+{
+	local_bh_disable();
+
+	if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) {
+		/* This can happen if the output path signals a
+		 * dst_link_failure() for an outgoing ICMP packet.
+		 */
+		local_bh_enable();
+		return 1;
+	}
+	return 0;
+}
+
+static void icmp_xmit_unlock(void)
+{
+	spin_unlock_bh(&icmp_socket->sk->sk_lock.slock);
+}
+
+/*
+ *	Send an ICMP frame.
+ */
+
+/*
+ *	Check transmit rate limitation for given message.
+ *	The rate information is held in the destination cache now.
+ *	This function is generic and could be used for other purposes
+ *	too. It uses a Token bucket filter as suggested by Alexey Kuznetsov.
+ *
+ *	Note that the same dst_entry fields are modified by functions in
+ *	route.c too, but these work for packet destinations while xrlim_allow
+ *	works for icmp destinations. This means the rate limiting information
+ *	for one "ip object" is shared - and these ICMPs are twice limited:
+ *	by source and by destination.
+ *
+ *	RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate
+ *			  SHOULD allow setting of rate limits
+ *
+ * 	Shared between ICMPv4 and ICMPv6.
+ */
+#define XRLIM_BURST_FACTOR 6
+int xrlim_allow(struct dst_entry *dst, int timeout)
+{
+	unsigned long now;
+	int rc = 0;
+
+	now = jiffies;
+	dst->rate_tokens += now - dst->rate_last;
+	dst->rate_last = now;
+	if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout)
+		dst->rate_tokens = XRLIM_BURST_FACTOR * timeout;
+	if (dst->rate_tokens >= timeout) {
+		dst->rate_tokens -= timeout;
+		rc = 1;
+	}
+	return rc;
+}
+
+static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code)
+{
+	struct dst_entry *dst = &rt->u.dst;
+	int rc = 1;
+
+	if (type > NR_ICMP_TYPES)
+		goto out;
+
+	/* Don't limit PMTU discovery. */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
+		goto out;
+
+	/* No rate limit on loopback */
+	if (dst->dev && (dst->dev->flags&IFF_LOOPBACK))
+ 		goto out;
+
+	/* Limit if icmp type is enabled in ratemask. */
+	if ((1 << type) & sysctl_icmp_ratemask)
+		rc = xrlim_allow(dst, sysctl_icmp_ratelimit);
+out:
+	return rc;
+}
+
+/*
+ *	Maintain the counters used in the SNMP statistics for outgoing ICMP
+ */
+static void icmp_out_count(int type)
+{
+	if (type <= NR_ICMP_TYPES) {
+		ICMP_INC_STATS(icmp_pointers[type].output_entry);
+		ICMP_INC_STATS(ICMP_MIB_OUTMSGS);
+	}
+}
+
+/*
+ *	Checksum each fragment, and on the first include the headers and final
+ *	checksum.
+ */
+static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
+			  struct sk_buff *skb)
+{
+	struct icmp_bxm *icmp_param = (struct icmp_bxm *)from;
+	unsigned int csum;
+
+	csum = skb_copy_and_csum_bits(icmp_param->skb,
+				      icmp_param->offset + offset,
+				      to, len, 0);
+
+	skb->csum = csum_block_add(skb->csum, csum, odd);
+	if (icmp_pointers[icmp_param->data.icmph.type].error)
+		nf_ct_attach(skb, icmp_param->skb);
+	return 0;
+}
+
+static void icmp_push_reply(struct icmp_bxm *icmp_param,
+			    struct ipcm_cookie *ipc, struct rtable *rt)
+{
+	struct sk_buff *skb;
+
+	ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param,
+		       icmp_param->data_len+icmp_param->head_len,
+		       icmp_param->head_len,
+		       ipc, rt, MSG_DONTWAIT);
+
+	if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) {
+		struct icmphdr *icmph = skb->h.icmph;
+		unsigned int csum = 0;
+		struct sk_buff *skb1;
+
+		skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) {
+			csum = csum_add(csum, skb1->csum);
+		}
+		csum = csum_partial_copy_nocheck((void *)&icmp_param->data,
+						 (char *)icmph,
+						 icmp_param->head_len, csum);
+		icmph->checksum = csum_fold(csum);
+		skb->ip_summed = CHECKSUM_NONE;
+		ip_push_pending_frames(icmp_socket->sk);
+	}
+}
+
+/*
+ *	Driving logic for building and sending ICMP messages.
+ */
+
+static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
+{
+	struct sock *sk = icmp_socket->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipcm_cookie ipc;
+	struct rtable *rt = (struct rtable *)skb->dst;
+	u32 daddr;
+
+	if (ip_options_echo(&icmp_param->replyopts, skb))
+		goto out;
+
+	if (icmp_xmit_lock())
+		return;
+
+	icmp_param->data.icmph.checksum = 0;
+	icmp_out_count(icmp_param->data.icmph.type);
+
+	inet->tos = skb->nh.iph->tos;
+	daddr = ipc.addr = rt->rt_src;
+	ipc.opt = NULL;
+	if (icmp_param->replyopts.optlen) {
+		ipc.opt = &icmp_param->replyopts;
+		if (ipc.opt->srr)
+			daddr = icmp_param->replyopts.faddr;
+	}
+	{
+		struct flowi fl = { .nl_u = { .ip4_u =
+					      { .daddr = daddr,
+						.saddr = rt->rt_spec_dst,
+						.tos = RT_TOS(skb->nh.iph->tos) } },
+				    .proto = IPPROTO_ICMP };
+		if (ip_route_output_key(&rt, &fl))
+			goto out_unlock;
+	}
+	if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type,
+			       icmp_param->data.icmph.code))
+		icmp_push_reply(icmp_param, &ipc, rt);
+	ip_rt_put(rt);
+out_unlock:
+	icmp_xmit_unlock();
+out:;
+}
+
+
+/*
+ *	Send an ICMP message in response to a situation
+ *
+ *	RFC 1122: 3.2.2	MUST send at least the IP header and 8 bytes of header.
+ *		  MAY send more (we do).
+ *			MUST NOT change this header information.
+ *			MUST NOT reply to a multicast/broadcast IP address.
+ *			MUST NOT reply to a multicast/broadcast MAC address.
+ *			MUST reply to only the first fragment.
+ */
+
+void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info)
+{
+	struct iphdr *iph;
+	int room;
+	struct icmp_bxm icmp_param;
+	struct rtable *rt = (struct rtable *)skb_in->dst;
+	struct ipcm_cookie ipc;
+	u32 saddr;
+	u8  tos;
+
+	if (!rt)
+		goto out;
+
+	/*
+	 *	Find the original header. It is expected to be valid, of course.
+	 *	Check this, icmp_send is called from the most obscure devices
+	 *	sometimes.
+	 */
+	iph = skb_in->nh.iph;
+
+	if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail)
+		goto out;
+
+	/*
+	 *	No replies to physical multicast/broadcast
+	 */
+	if (skb_in->pkt_type != PACKET_HOST)
+		goto out;
+
+	/*
+	 *	Now check at the protocol level
+	 */
+	if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
+		goto out;
+
+	/*
+	 *	Only reply to fragment 0. We byte re-order the constant
+	 *	mask for efficiency.
+	 */
+	if (iph->frag_off & htons(IP_OFFSET))
+		goto out;
+
+	/*
+	 *	If we send an ICMP error to an ICMP error a mess would result..
+	 */
+	if (icmp_pointers[type].error) {
+		/*
+		 *	We are an error, check if we are replying to an
+		 *	ICMP error
+		 */
+		if (iph->protocol == IPPROTO_ICMP) {
+			u8 _inner_type, *itp;
+
+			itp = skb_header_pointer(skb_in,
+						 skb_in->nh.raw +
+						 (iph->ihl << 2) +
+						 offsetof(struct icmphdr,
+							  type) -
+						 skb_in->data,
+						 sizeof(_inner_type),
+						 &_inner_type);
+			if (itp == NULL)
+				goto out;
+
+			/*
+			 *	Assume any unknown ICMP type is an error. This
+			 *	isn't specified by the RFC, but think about it..
+			 */
+			if (*itp > NR_ICMP_TYPES ||
+			    icmp_pointers[*itp].error)
+				goto out;
+		}
+	}
+
+	if (icmp_xmit_lock())
+		return;
+
+	/*
+	 *	Construct source address and options.
+	 */
+
+	saddr = iph->daddr;
+	if (!(rt->rt_flags & RTCF_LOCAL))
+		saddr = 0;
+
+	tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
+					   IPTOS_PREC_INTERNETCONTROL) :
+					  iph->tos;
+
+	if (ip_options_echo(&icmp_param.replyopts, skb_in))
+		goto ende;
+
+
+	/*
+	 *	Prepare data for ICMP header.
+	 */
+
+	icmp_param.data.icmph.type	 = type;
+	icmp_param.data.icmph.code	 = code;
+	icmp_param.data.icmph.un.gateway = info;
+	icmp_param.data.icmph.checksum	 = 0;
+	icmp_param.skb	  = skb_in;
+	icmp_param.offset = skb_in->nh.raw - skb_in->data;
+	icmp_out_count(icmp_param.data.icmph.type);
+	inet_sk(icmp_socket->sk)->tos = tos;
+	ipc.addr = iph->saddr;
+	ipc.opt = &icmp_param.replyopts;
+
+	{
+		struct flowi fl = {
+			.nl_u = {
+				.ip4_u = {
+					.daddr = icmp_param.replyopts.srr ?
+						icmp_param.replyopts.faddr :
+						iph->saddr,
+					.saddr = saddr,
+					.tos = RT_TOS(tos)
+				}
+			},
+			.proto = IPPROTO_ICMP,
+			.uli_u = {
+				.icmpt = {
+					.type = type,
+					.code = code
+				}
+			}
+		};
+		if (ip_route_output_key(&rt, &fl))
+			goto out_unlock;
+	}
+
+	if (!icmpv4_xrlim_allow(rt, type, code))
+		goto ende;
+
+	/* RFC says return as much as we can without exceeding 576 bytes. */
+
+	room = dst_mtu(&rt->u.dst);
+	if (room > 576)
+		room = 576;
+	room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen;
+	room -= sizeof(struct icmphdr);
+
+	icmp_param.data_len = skb_in->len - icmp_param.offset;
+	if (icmp_param.data_len > room)
+		icmp_param.data_len = room;
+	icmp_param.head_len = sizeof(struct icmphdr);
+
+	icmp_push_reply(&icmp_param, &ipc, rt);
+ende:
+	ip_rt_put(rt);
+out_unlock:
+	icmp_xmit_unlock();
+out:;
+}
+
+
+/*
+ *	Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH.
+ */
+
+static void icmp_unreach(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	struct icmphdr *icmph;
+	int hash, protocol;
+	struct net_protocol *ipprot;
+	struct sock *raw_sk;
+	u32 info = 0;
+
+	/*
+	 *	Incomplete header ?
+	 * 	Only checks for the IP header, there should be an
+	 *	additional check for longer headers in upper levels.
+	 */
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out_err;
+
+	icmph = skb->h.icmph;
+	iph   = (struct iphdr *)skb->data;
+
+	if (iph->ihl < 5) /* Mangled header, drop. */
+		goto out_err;
+
+	if (icmph->type == ICMP_DEST_UNREACH) {
+		switch (icmph->code & 15) {
+		case ICMP_NET_UNREACH:
+		case ICMP_HOST_UNREACH:
+		case ICMP_PROT_UNREACH:
+		case ICMP_PORT_UNREACH:
+			break;
+		case ICMP_FRAG_NEEDED:
+			if (ipv4_config.no_pmtu_disc) {
+				LIMIT_NETDEBUG(
+					printk(KERN_INFO "ICMP: %u.%u.%u.%u: "
+							 "fragmentation needed "
+							 "and DF set.\n",
+					       NIPQUAD(iph->daddr)));
+			} else {
+				info = ip_rt_frag_needed(iph,
+						     ntohs(icmph->un.frag.mtu));
+				if (!info)
+					goto out;
+			}
+			break;
+		case ICMP_SR_FAILED:
+			LIMIT_NETDEBUG(
+				printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source "
+						 "Route Failed.\n",
+				       NIPQUAD(iph->daddr)));
+			break;
+		default:
+			break;
+		}
+		if (icmph->code > NR_ICMP_UNREACH)
+			goto out;
+	} else if (icmph->type == ICMP_PARAMETERPROB)
+		info = ntohl(icmph->un.gateway) >> 24;
+
+	/*
+	 *	Throw it at our lower layers
+	 *
+	 *	RFC 1122: 3.2.2 MUST extract the protocol ID from the passed
+	 *		  header.
+	 *	RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the
+	 *		  transport layer.
+	 *	RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to
+	 *		  transport layer.
+	 */
+
+	/*
+	 *	Check the other end isnt violating RFC 1122. Some routers send
+	 *	bogus responses to broadcast frames. If you see this message
+	 *	first check your netmask matches at both ends, if it does then
+	 *	get the other vendor to fix their kit.
+	 */
+
+	if (!sysctl_icmp_ignore_bogus_error_responses &&
+	    inet_addr_type(iph->daddr) == RTN_BROADCAST) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP "
+					    "type %u, code %u "
+					    "error to a broadcast: %u.%u.%u.%u on %s\n",
+			       NIPQUAD(skb->nh.iph->saddr),
+			       icmph->type, icmph->code,
+			       NIPQUAD(iph->daddr),
+			       skb->dev->name);
+		goto out;
+	}
+
+	/* Checkin full IP header plus 8 bytes of protocol to
+	 * avoid additional coding at protocol handlers.
+	 */
+	if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+		goto out;
+
+	iph = (struct iphdr *)skb->data;
+	protocol = iph->protocol;
+
+	/*
+	 *	Deliver ICMP message to raw sockets. Pretty useless feature?
+	 */
+
+	/* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+	hash = protocol & (MAX_INET_PROTOS - 1);
+	read_lock(&raw_v4_lock);
+	if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) {
+		while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr,
+						 iph->saddr,
+						 skb->dev->ifindex)) != NULL) {
+			raw_err(raw_sk, skb, info);
+			raw_sk = sk_next(raw_sk);
+			iph = (struct iphdr *)skb->data;
+		}
+	}
+	read_unlock(&raw_v4_lock);
+
+	rcu_read_lock();
+	ipprot = rcu_dereference(inet_protos[hash]);
+	if (ipprot && ipprot->err_handler)
+		ipprot->err_handler(skb, info);
+	rcu_read_unlock();
+
+out:
+	return;
+out_err:
+	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+	goto out;
+}
+
+
+/*
+ *	Handle ICMP_REDIRECT.
+ */
+
+static void icmp_redirect(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	unsigned long ip;
+
+	if (skb->len < sizeof(struct iphdr))
+		goto out_err;
+
+	/*
+	 *	Get the copied header of the packet that caused the redirect
+	 */
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+
+	iph = (struct iphdr *)skb->data;
+	ip = iph->daddr;
+
+	switch (skb->h.icmph->code & 7) {
+	case ICMP_REDIR_NET:
+	case ICMP_REDIR_NETTOS:
+		/*
+		 * As per RFC recommendations now handle it as a host redirect.
+		 */
+	case ICMP_REDIR_HOST:
+	case ICMP_REDIR_HOSTTOS:
+		ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway,
+			       iph->saddr, iph->tos, skb->dev);
+		break;
+  	}
+out:
+	return;
+out_err:
+	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+	goto out;
+}
+
+/*
+ *	Handle ICMP_ECHO ("ping") requests.
+ *
+ *	RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo
+ *		  requests.
+ *	RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be
+ *		  included in the reply.
+ *	RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring
+ *		  echo requests, MUST have default=NOT.
+ *	See also WRT handling of options once they are done and working.
+ */
+
+static void icmp_echo(struct sk_buff *skb)
+{
+	if (!sysctl_icmp_echo_ignore_all) {
+		struct icmp_bxm icmp_param;
+
+		icmp_param.data.icmph	   = *skb->h.icmph;
+		icmp_param.data.icmph.type = ICMP_ECHOREPLY;
+		icmp_param.skb		   = skb;
+		icmp_param.offset	   = 0;
+		icmp_param.data_len	   = skb->len;
+		icmp_param.head_len	   = sizeof(struct icmphdr);
+		icmp_reply(&icmp_param, skb);
+	}
+}
+
+/*
+ *	Handle ICMP Timestamp requests.
+ *	RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests.
+ *		  SHOULD be in the kernel for minimum random latency.
+ *		  MUST be accurate to a few minutes.
+ *		  MUST be updated at least at 15Hz.
+ */
+static void icmp_timestamp(struct sk_buff *skb)
+{
+	struct timeval tv;
+	struct icmp_bxm icmp_param;
+	/*
+	 *	Too short.
+	 */
+	if (skb->len < 4)
+		goto out_err;
+
+	/*
+	 *	Fill in the current time as ms since midnight UT:
+	 */
+	do_gettimeofday(&tv);
+	icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 +
+					 tv.tv_usec / 1000);
+	icmp_param.data.times[2] = icmp_param.data.times[1];
+	if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
+		BUG();
+	icmp_param.data.icmph	   = *skb->h.icmph;
+	icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
+	icmp_param.data.icmph.code = 0;
+	icmp_param.skb		   = skb;
+	icmp_param.offset	   = 0;
+	icmp_param.data_len	   = 0;
+	icmp_param.head_len	   = sizeof(struct icmphdr) + 12;
+	icmp_reply(&icmp_param, skb);
+out:
+	return;
+out_err:
+	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+	goto out;
+}
+
+
+/*
+ *	Handle ICMP_ADDRESS_MASK requests.  (RFC950)
+ *
+ * RFC1122 (3.2.2.9).  A host MUST only send replies to
+ * ADDRESS_MASK requests if it's been configured as an address mask
+ * agent.  Receiving a request doesn't constitute implicit permission to
+ * act as one. Of course, implementing this correctly requires (SHOULD)
+ * a way to turn the functionality on and off.  Another one for sysctl(),
+ * I guess. -- MS
+ *
+ * RFC1812 (4.3.3.9).	A router MUST implement it.
+ *			A router SHOULD have switch turning it on/off.
+ *		      	This switch MUST be ON by default.
+ *
+ * Gratuitous replies, zero-source replies are not implemented,
+ * that complies with RFC. DO NOT implement them!!! All the idea
+ * of broadcast addrmask replies as specified in RFC950 is broken.
+ * The problem is that it is not uncommon to have several prefixes
+ * on one physical interface. Moreover, addrmask agent can even be
+ * not aware of existing another prefixes.
+ * If source is zero, addrmask agent cannot choose correct prefix.
+ * Gratuitous mask announcements suffer from the same problem.
+ * RFC1812 explains it, but still allows to use ADDRMASK,
+ * that is pretty silly. --ANK
+ *
+ * All these rules are so bizarre, that I removed kernel addrmask
+ * support at all. It is wrong, it is obsolete, nobody uses it in
+ * any case. --ANK
+ *
+ * Furthermore you can do it with a usermode address agent program
+ * anyway...
+ */
+
+static void icmp_address(struct sk_buff *skb)
+{
+#if 0
+	if (net_ratelimit())
+		printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n");
+#endif
+}
+
+/*
+ * RFC1812 (4.3.3.9).	A router SHOULD listen all replies, and complain
+ *			loudly if an inconsistency is found.
+ */
+
+static void icmp_address_reply(struct sk_buff *skb)
+{
+	struct rtable *rt = (struct rtable *)skb->dst;
+	struct net_device *dev = skb->dev;
+	struct in_device *in_dev;
+	struct in_ifaddr *ifa;
+
+	if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC))
+		goto out;
+
+	in_dev = in_dev_get(dev);
+	if (!in_dev)
+		goto out;
+	rcu_read_lock();
+	if (in_dev->ifa_list &&
+	    IN_DEV_LOG_MARTIANS(in_dev) &&
+	    IN_DEV_FORWARD(in_dev)) {
+		u32 _mask, *mp;
+
+		mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask);
+		if (mp == NULL)
+			BUG();
+		for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+			if (*mp == ifa->ifa_mask &&
+			    inet_ifa_match(rt->rt_src, ifa))
+				break;
+		}
+		if (!ifa && net_ratelimit()) {
+			printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from "
+					 "%s/%u.%u.%u.%u\n",
+			       NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src));
+		}
+	}
+	rcu_read_unlock();
+	in_dev_put(in_dev);
+out:;
+}
+
+static void icmp_discard(struct sk_buff *skb)
+{
+}
+
+/*
+ *	Deal with incoming ICMP packets.
+ */
+int icmp_rcv(struct sk_buff *skb)
+{
+	struct icmphdr *icmph;
+	struct rtable *rt = (struct rtable *)skb->dst;
+
+	ICMP_INC_STATS_BH(ICMP_MIB_INMSGS);
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_HW:
+		if (!(u16)csum_fold(skb->csum))
+			break;
+		NETDEBUG(if (net_ratelimit())
+				printk(KERN_DEBUG "icmp v4 hw csum failure\n"));
+	case CHECKSUM_NONE:
+		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))
+			goto error;
+	default:;
+	}
+
+	if (!pskb_pull(skb, sizeof(struct icmphdr)))
+		goto error;
+
+	icmph = skb->h.icmph;
+
+	/*
+	 *	18 is the highest 'known' ICMP type. Anything else is a mystery
+	 *
+	 *	RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
+	 *		  discarded.
+	 */
+	if (icmph->type > NR_ICMP_TYPES)
+		goto error;
+
+
+	/*
+	 *	Parse the ICMP message
+	 */
+
+ 	if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+		/*
+		 *	RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be
+		 *	  silently ignored (we let user decide with a sysctl).
+		 *	RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently
+		 *	  discarded if to broadcast/multicast.
+		 */
+		if (icmph->type == ICMP_ECHO &&
+		    sysctl_icmp_echo_ignore_broadcasts) {
+			goto error;
+		}
+		if (icmph->type != ICMP_ECHO &&
+		    icmph->type != ICMP_TIMESTAMP &&
+		    icmph->type != ICMP_ADDRESS &&
+		    icmph->type != ICMP_ADDRESSREPLY) {
+			goto error;
+  		}
+	}
+
+	ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry);
+	icmp_pointers[icmph->type].handler(skb);
+
+drop:
+	kfree_skb(skb);
+	return 0;
+error:
+	ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+	goto drop;
+}
+
+/*
+ *	This table is the definition of how we handle ICMP.
+ */
+static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = {
+	[ICMP_ECHOREPLY] = {
+		.output_entry = ICMP_MIB_OUTECHOREPS,
+		.input_entry = ICMP_MIB_INECHOREPS,
+		.handler = icmp_discard,
+	},
+	[1] = {
+		.output_entry = ICMP_MIB_DUMMY,
+		.input_entry = ICMP_MIB_INERRORS,
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[2] = {
+		.output_entry = ICMP_MIB_DUMMY,
+		.input_entry = ICMP_MIB_INERRORS,
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[ICMP_DEST_UNREACH] = {
+		.output_entry = ICMP_MIB_OUTDESTUNREACHS,
+		.input_entry = ICMP_MIB_INDESTUNREACHS,
+		.handler = icmp_unreach,
+		.error = 1,
+	},
+	[ICMP_SOURCE_QUENCH] = {
+		.output_entry = ICMP_MIB_OUTSRCQUENCHS,
+		.input_entry = ICMP_MIB_INSRCQUENCHS,
+		.handler = icmp_unreach,
+		.error = 1,
+	},
+	[ICMP_REDIRECT] = {
+		.output_entry = ICMP_MIB_OUTREDIRECTS,
+		.input_entry = ICMP_MIB_INREDIRECTS,
+		.handler = icmp_redirect,
+		.error = 1,
+	},
+	[6] = {
+		.output_entry = ICMP_MIB_DUMMY,
+		.input_entry = ICMP_MIB_INERRORS,
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[7] = {
+		.output_entry = ICMP_MIB_DUMMY,
+		.input_entry = ICMP_MIB_INERRORS,
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[ICMP_ECHO] = {
+		.output_entry = ICMP_MIB_OUTECHOS,
+		.input_entry = ICMP_MIB_INECHOS,
+		.handler = icmp_echo,
+	},
+	[9] = {
+		.output_entry = ICMP_MIB_DUMMY,
+		.input_entry = ICMP_MIB_INERRORS,
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[10] = {
+		.output_entry = ICMP_MIB_DUMMY,
+		.input_entry = ICMP_MIB_INERRORS,
+		.handler = icmp_discard,
+		.error = 1,
+	},
+	[ICMP_TIME_EXCEEDED] = {
+		.output_entry = ICMP_MIB_OUTTIMEEXCDS,
+		.input_entry = ICMP_MIB_INTIMEEXCDS,
+		.handler = icmp_unreach,
+		.error = 1,
+	},
+	[ICMP_PARAMETERPROB] = {
+		.output_entry = ICMP_MIB_OUTPARMPROBS,
+		.input_entry = ICMP_MIB_INPARMPROBS,
+		.handler = icmp_unreach,
+		.error = 1,
+	},
+	[ICMP_TIMESTAMP] = {
+		.output_entry = ICMP_MIB_OUTTIMESTAMPS,
+		.input_entry = ICMP_MIB_INTIMESTAMPS,
+		.handler = icmp_timestamp,
+	},
+	[ICMP_TIMESTAMPREPLY] = {
+		.output_entry = ICMP_MIB_OUTTIMESTAMPREPS,
+		.input_entry = ICMP_MIB_INTIMESTAMPREPS,
+		.handler = icmp_discard,
+	},
+	[ICMP_INFO_REQUEST] = {
+		.output_entry = ICMP_MIB_DUMMY,
+		.input_entry = ICMP_MIB_DUMMY,
+		.handler = icmp_discard,
+	},
+ 	[ICMP_INFO_REPLY] = {
+		.output_entry = ICMP_MIB_DUMMY,
+		.input_entry = ICMP_MIB_DUMMY,
+		.handler = icmp_discard,
+	},
+	[ICMP_ADDRESS] = {
+		.output_entry = ICMP_MIB_OUTADDRMASKS,
+		.input_entry = ICMP_MIB_INADDRMASKS,
+		.handler = icmp_address,
+	},
+	[ICMP_ADDRESSREPLY] = {
+		.output_entry = ICMP_MIB_OUTADDRMASKREPS,
+		.input_entry = ICMP_MIB_INADDRMASKREPS,
+		.handler = icmp_address_reply,
+	},
+};
+
+void __init icmp_init(struct net_proto_family *ops)
+{
+	struct inet_sock *inet;
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		int err;
+
+		if (!cpu_possible(i))
+			continue;
+
+		err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP,
+				       &per_cpu(__icmp_socket, i));
+
+		if (err < 0)
+			panic("Failed to create the ICMP control socket.\n");
+
+		per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC;
+
+		/* Enough space for 2 64K ICMP packets, including
+		 * sk_buff struct overhead.
+		 */
+		per_cpu(__icmp_socket, i)->sk->sk_sndbuf =
+			(2 * ((64 * 1024) + sizeof(struct sk_buff)));
+
+		inet = inet_sk(per_cpu(__icmp_socket, i)->sk);
+		inet->uc_ttl = -1;
+		inet->pmtudisc = IP_PMTUDISC_DONT;
+
+		/* Unhash it so that IP input processing does not even
+		 * see it, we do not wish this socket to see incoming
+		 * packets.
+		 */
+		per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk);
+	}
+}
+
+EXPORT_SYMBOL(icmp_err_convert);
+EXPORT_SYMBOL(icmp_send);
+EXPORT_SYMBOL(icmp_statistics);
+EXPORT_SYMBOL(xrlim_allow);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
new file mode 100644
index 000000000000..1f3183168a90
--- /dev/null
+++ b/net/ipv4/igmp.c
@@ -0,0 +1,2473 @@
+/*
+ *	Linux NET3:	Internet Group Management Protocol  [IGMP]
+ *
+ *	This code implements the IGMP protocol as defined in RFC1112. There has
+ *	been a further revision of this protocol since which is now supported.
+ *
+ *	If you have trouble with this module be careful what gcc you have used,
+ *	the older version didn't come out right using gcc 2.5.8, the newer one
+ *	seems to fall out with gcc 2.6.2.
+ *
+ *	Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $
+ *
+ *	Authors:
+ *		Alan Cox <Alan.Cox@linux.org>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Fixes:
+ *
+ *		Alan Cox	:	Added lots of __inline__ to optimise
+ *					the memory usage of all the tiny little
+ *					functions.
+ *		Alan Cox	:	Dumped the header building experiment.
+ *		Alan Cox	:	Minor tweaks ready for multicast routing
+ *					and extended IGMP protocol.
+ *		Alan Cox	:	Removed a load of inline directives. Gcc 2.5.8
+ *					writes utterly bogus code otherwise (sigh)
+ *					fixed IGMP loopback to behave in the manner
+ *					desired by mrouted, fixed the fact it has been
+ *					broken since 1.3.6 and cleaned up a few minor
+ *					points.
+ *
+ *		Chih-Jen Chang	:	Tried to revise IGMP to Version 2
+ *		Tsu-Sheng Tsao		E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu
+ *					The enhancements are mainly based on Steve Deering's 
+ * 					ipmulti-3.5 source code.
+ *		Chih-Jen Chang	:	Added the igmp_get_mrouter_info and
+ *		Tsu-Sheng Tsao		igmp_set_mrouter_info to keep track of
+ *					the mrouted version on that device.
+ *		Chih-Jen Chang	:	Added the max_resp_time parameter to
+ *		Tsu-Sheng Tsao		igmp_heard_query(). Using this parameter
+ *					to identify the multicast router version
+ *					and do what the IGMP version 2 specified.
+ *		Chih-Jen Chang	:	Added a timer to revert to IGMP V2 router
+ *		Tsu-Sheng Tsao		if the specified time expired.
+ *		Alan Cox	:	Stop IGMP from 0.0.0.0 being accepted.
+ *		Alan Cox	:	Use GFP_ATOMIC in the right places.
+ *		Christian Daudt :	igmp timer wasn't set for local group
+ *					memberships but was being deleted, 
+ *					which caused a "del_timer() called 
+ *					from %p with timer not initialized\n"
+ *					message (960131).
+ *		Christian Daudt :	removed del_timer from 
+ *					igmp_timer_expire function (960205).
+ *             Christian Daudt :       igmp_heard_report now only calls
+ *                                     igmp_timer_expire if tm->running is
+ *                                     true (960216).
+ *		Malcolm Beattie :	ttl comparison wrong in igmp_rcv made
+ *					igmp_heard_query never trigger. Expiry
+ *					miscalculation fixed in igmp_heard_query
+ *					and random() made to return unsigned to
+ *					prevent negative expiry times.
+ *		Alexey Kuznetsov:	Wrong group leaving behaviour, backport
+ *					fix from pending 2.1.x patches.
+ *		Alan Cox:		Forget to enable FDDI support earlier.
+ *		Alexey Kuznetsov:	Fixed leaving groups on device down.
+ *		Alexey Kuznetsov:	Accordance to igmp-v2-06 draft.
+ *		David L Stevens:	IGMPv3 support, with help from
+ *					Vinay Kulkarni
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/if_arp.h>
+#include <linux/rtnetlink.h>
+#include <linux/times.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/sock.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#ifdef CONFIG_IP_MROUTE
+#include <linux/mroute.h>
+#endif
+#ifdef CONFIG_PROC_FS
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#endif
+
+#define IP_MAX_MEMBERSHIPS	20
+#define IP_MAX_MSF		10
+
+#ifdef CONFIG_IP_MULTICAST
+/* Parameter names and values are taken from igmp-v2-06 draft */
+
+#define IGMP_V1_Router_Present_Timeout		(400*HZ)
+#define IGMP_V2_Router_Present_Timeout		(400*HZ)
+#define IGMP_Unsolicited_Report_Interval	(10*HZ)
+#define IGMP_Query_Response_Interval		(10*HZ)
+#define IGMP_Unsolicited_Report_Count		2
+
+
+#define IGMP_Initial_Report_Delay		(1)
+
+/* IGMP_Initial_Report_Delay is not from IGMP specs!
+ * IGMP specs require to report membership immediately after
+ * joining a group, but we delay the first report by a
+ * small interval. It seems more natural and still does not
+ * contradict to specs provided this delay is small enough.
+ */
+
+#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \
+		(in_dev)->cnf.force_igmp_version == 1 || \
+		((in_dev)->mr_v1_seen && \
+		time_before(jiffies, (in_dev)->mr_v1_seen)))
+#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \
+		(in_dev)->cnf.force_igmp_version == 2 || \
+		((in_dev)->mr_v2_seen && \
+		time_before(jiffies, (in_dev)->mr_v2_seen)))
+
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im);
+static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr);
+static void igmpv3_clear_delrec(struct in_device *in_dev);
+static int sf_setstate(struct ip_mc_list *pmc);
+static void sf_markstate(struct ip_mc_list *pmc);
+#endif
+static void ip_mc_clear_src(struct ip_mc_list *pmc);
+static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+			 int sfcount, __u32 *psfsrc, int delta);
+
+static void ip_ma_put(struct ip_mc_list *im)
+{
+	if (atomic_dec_and_test(&im->refcnt)) {
+		in_dev_put(im->interface);
+		kfree(im);
+	}
+}
+
+#ifdef CONFIG_IP_MULTICAST
+
+/*
+ *	Timer management
+ */
+
+static __inline__ void igmp_stop_timer(struct ip_mc_list *im)
+{
+	spin_lock_bh(&im->lock);
+	if (del_timer(&im->timer))
+		atomic_dec(&im->refcnt);
+	im->tm_running=0;
+	im->reporter = 0;
+	im->unsolicit_count = 0;
+	spin_unlock_bh(&im->lock);
+}
+
+/* It must be called with locked im->lock */
+static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
+{
+	int tv=net_random() % max_delay;
+
+	im->tm_running=1;
+	if (!mod_timer(&im->timer, jiffies+tv+2))
+		atomic_inc(&im->refcnt);
+}
+
+static void igmp_gq_start_timer(struct in_device *in_dev)
+{
+	int tv = net_random() % in_dev->mr_maxdelay;
+
+	in_dev->mr_gq_running = 1;
+	if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2))
+		in_dev_hold(in_dev);
+}
+
+static void igmp_ifc_start_timer(struct in_device *in_dev, int delay)
+{
+	int tv = net_random() % delay;
+
+	if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2))
+		in_dev_hold(in_dev);
+}
+
+static void igmp_mod_timer(struct ip_mc_list *im, int max_delay)
+{
+	spin_lock_bh(&im->lock);
+	im->unsolicit_count = 0;
+	if (del_timer(&im->timer)) {
+		if ((long)(im->timer.expires-jiffies) < max_delay) {
+			add_timer(&im->timer);
+			im->tm_running=1;
+			spin_unlock_bh(&im->lock);
+			return;
+		}
+		atomic_dec(&im->refcnt);
+	}
+	igmp_start_timer(im, max_delay);
+	spin_unlock_bh(&im->lock);
+}
+
+
+/*
+ *	Send an IGMP report.
+ */
+
+#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4)
+
+
+static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type,
+	int gdeleted, int sdeleted)
+{
+	switch (type) {
+	case IGMPV3_MODE_IS_INCLUDE:
+	case IGMPV3_MODE_IS_EXCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		return !(pmc->gsquery && !psf->sf_gsresp);
+	case IGMPV3_CHANGE_TO_INCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		return psf->sf_count[MCAST_INCLUDE] != 0;
+	case IGMPV3_CHANGE_TO_EXCLUDE:
+		if (gdeleted || sdeleted)
+			return 0;
+		if (pmc->sfcount[MCAST_EXCLUDE] == 0 ||
+		    psf->sf_count[MCAST_INCLUDE])
+			return 0;
+		return pmc->sfcount[MCAST_EXCLUDE] ==
+			psf->sf_count[MCAST_EXCLUDE];
+	case IGMPV3_ALLOW_NEW_SOURCES:
+		if (gdeleted || !psf->sf_crcount)
+			return 0;
+		return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted;
+	case IGMPV3_BLOCK_OLD_SOURCES:
+		if (pmc->sfmode == MCAST_INCLUDE)
+			return gdeleted || (psf->sf_crcount && sdeleted);
+		return psf->sf_crcount && !gdeleted && !sdeleted;
+	}
+	return 0;
+}
+
+static int
+igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
+{
+	struct ip_sf_list *psf;
+	int scount = 0;
+
+	for (psf=pmc->sources; psf; psf=psf->sf_next) {
+		if (!is_in(pmc, psf, type, gdeleted, sdeleted))
+			continue;
+		scount++;
+	}
+	return scount;
+}
+
+static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size)
+{
+	struct sk_buff *skb;
+	struct rtable *rt;
+	struct iphdr *pip;
+	struct igmpv3_report *pig;
+
+	skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	{
+		struct flowi fl = { .oif = dev->ifindex,
+				    .nl_u = { .ip4_u = {
+				    .daddr = IGMPV3_ALL_MCR } },
+				    .proto = IPPROTO_IGMP };
+		if (ip_route_output_key(&rt, &fl)) {
+			kfree_skb(skb);
+			return NULL;
+		}
+	}
+	if (rt->rt_src == 0) {
+		kfree_skb(skb);
+		ip_rt_put(rt);
+		return NULL;
+	}
+
+	skb->dst = &rt->u.dst;
+	skb->dev = dev;
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+	skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+	pip->version  = 4;
+	pip->ihl      = (sizeof(struct iphdr)+4)>>2;
+	pip->tos      = 0xc0;
+	pip->frag_off = htons(IP_DF);
+	pip->ttl      = 1;
+	pip->daddr    = rt->rt_dst;
+	pip->saddr    = rt->rt_src;
+	pip->protocol = IPPROTO_IGMP;
+	pip->tot_len  = 0;	/* filled in later */
+	ip_select_ident(pip, &rt->u.dst, NULL);
+	((u8*)&pip[1])[0] = IPOPT_RA;
+	((u8*)&pip[1])[1] = 4;
+	((u8*)&pip[1])[2] = 0;
+	((u8*)&pip[1])[3] = 0;
+
+	pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig));
+	skb->h.igmph = (struct igmphdr *)pig;
+	pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT;
+	pig->resv1 = 0;
+	pig->csum = 0;
+	pig->resv2 = 0;
+	pig->ngrec = 0;
+	return skb;
+}
+
+static int igmpv3_sendpack(struct sk_buff *skb)
+{
+	struct iphdr *pip = skb->nh.iph;
+	struct igmphdr *pig = skb->h.igmph;
+	int iplen, igmplen;
+
+	iplen = skb->tail - (unsigned char *)skb->nh.iph;
+	pip->tot_len = htons(iplen);
+	ip_send_check(pip);
+
+	igmplen = skb->tail - (unsigned char *)skb->h.igmph;
+	pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen);
+
+	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev,
+		       dst_output);
+}
+
+static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
+{
+	return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel);
+}
+
+static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
+	int type, struct igmpv3_grec **ppgr)
+{
+	struct net_device *dev = pmc->interface->dev;
+	struct igmpv3_report *pih;
+	struct igmpv3_grec *pgr;
+
+	if (!skb)
+		skb = igmpv3_newpack(dev, dev->mtu);
+	if (!skb)
+		return NULL;
+	pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
+	pgr->grec_type = type;
+	pgr->grec_auxwords = 0;
+	pgr->grec_nsrcs = 0;
+	pgr->grec_mca = pmc->multiaddr;
+	pih = (struct igmpv3_report *)skb->h.igmph;
+	pih->ngrec = htons(ntohs(pih->ngrec)+1);
+	*ppgr = pgr;
+	return skb;
+}
+
+#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \
+	skb_tailroom(skb)) : 0)
+
+static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
+	int type, int gdeleted, int sdeleted)
+{
+	struct net_device *dev = pmc->interface->dev;
+	struct igmpv3_report *pih;
+	struct igmpv3_grec *pgr = NULL;
+	struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+	int scount, first, isquery, truncate;
+
+	if (pmc->multiaddr == IGMP_ALL_HOSTS)
+		return skb;
+
+	isquery = type == IGMPV3_MODE_IS_INCLUDE ||
+		  type == IGMPV3_MODE_IS_EXCLUDE;
+	truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
+		    type == IGMPV3_CHANGE_TO_EXCLUDE;
+
+	psf_list = sdeleted ? &pmc->tomb : &pmc->sources;
+
+	if (!*psf_list) {
+		if (type == IGMPV3_ALLOW_NEW_SOURCES ||
+		    type == IGMPV3_BLOCK_OLD_SOURCES)
+			return skb;
+		if (pmc->crcount || isquery) {
+			/* make sure we have room for group header and at
+			 * least one source.
+			 */
+			if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)+
+			    sizeof(__u32)) {
+				igmpv3_sendpack(skb);
+				skb = NULL; /* add_grhead will get a new one */
+			}
+			skb = add_grhead(skb, pmc, type, &pgr);
+		}
+		return skb;
+	}
+	pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL;
+
+	/* EX and TO_EX get a fresh packet, if needed */
+	if (truncate) {
+		if (pih && pih->ngrec &&
+		    AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+			if (skb)
+				igmpv3_sendpack(skb);
+			skb = igmpv3_newpack(dev, dev->mtu);
+		}
+	}
+	first = 1;
+	scount = 0;
+	psf_prev = NULL;
+	for (psf=*psf_list; psf; psf=psf_next) {
+		u32 *psrc;
+
+		psf_next = psf->sf_next;
+
+		if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+			psf_prev = psf;
+			continue;
+		}
+
+		/* clear marks on query responses */
+		if (isquery)
+			psf->sf_gsresp = 0;
+
+		if (AVAILABLE(skb) < sizeof(u32) +
+		    first*sizeof(struct igmpv3_grec)) {
+			if (truncate && !first)
+				break;	 /* truncate these */
+			if (pgr)
+				pgr->grec_nsrcs = htons(scount);
+			if (skb)
+				igmpv3_sendpack(skb);
+			skb = igmpv3_newpack(dev, dev->mtu);
+			first = 1;
+			scount = 0;
+		}
+		if (first) {
+			skb = add_grhead(skb, pmc, type, &pgr);
+			first = 0;
+		}
+		psrc = (u32 *)skb_put(skb, sizeof(u32));
+		*psrc = psf->sf_inaddr;
+		scount++;
+		if ((type == IGMPV3_ALLOW_NEW_SOURCES ||
+		     type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) {
+			psf->sf_crcount--;
+			if ((sdeleted || gdeleted) && psf->sf_crcount == 0) {
+				if (psf_prev)
+					psf_prev->sf_next = psf->sf_next;
+				else
+					*psf_list = psf->sf_next;
+				kfree(psf);
+				continue;
+			}
+		}
+		psf_prev = psf;
+	}
+	if (pgr)
+		pgr->grec_nsrcs = htons(scount);
+
+	if (isquery)
+		pmc->gsquery = 0;	/* clear query state on report */
+	return skb;
+}
+
+static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
+{
+	struct sk_buff *skb = NULL;
+	int type;
+
+	if (!pmc) {
+		read_lock(&in_dev->mc_list_lock);
+		for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+			if (pmc->multiaddr == IGMP_ALL_HOSTS)
+				continue;
+			spin_lock_bh(&pmc->lock);
+			if (pmc->sfcount[MCAST_EXCLUDE])
+				type = IGMPV3_MODE_IS_EXCLUDE;
+			else
+				type = IGMPV3_MODE_IS_INCLUDE;
+			skb = add_grec(skb, pmc, type, 0, 0);
+			spin_unlock_bh(&pmc->lock);
+		}
+		read_unlock(&in_dev->mc_list_lock);
+	} else {
+		spin_lock_bh(&pmc->lock);
+		if (pmc->sfcount[MCAST_EXCLUDE])
+			type = IGMPV3_MODE_IS_EXCLUDE;
+		else
+			type = IGMPV3_MODE_IS_INCLUDE;
+		skb = add_grec(skb, pmc, type, 0, 0);
+		spin_unlock_bh(&pmc->lock);
+	}
+	if (!skb)
+		return 0;
+	return igmpv3_sendpack(skb);
+}
+
+/*
+ * remove zero-count source records from a source filter list
+ */
+static void igmpv3_clear_zeros(struct ip_sf_list **ppsf)
+{
+	struct ip_sf_list *psf_prev, *psf_next, *psf;
+
+	psf_prev = NULL;
+	for (psf=*ppsf; psf; psf = psf_next) {
+		psf_next = psf->sf_next;
+		if (psf->sf_crcount == 0) {
+			if (psf_prev)
+				psf_prev->sf_next = psf->sf_next;
+			else
+				*ppsf = psf->sf_next;
+			kfree(psf);
+		} else
+			psf_prev = psf;
+	}
+}
+
+static void igmpv3_send_cr(struct in_device *in_dev)
+{
+	struct ip_mc_list *pmc, *pmc_prev, *pmc_next;
+	struct sk_buff *skb = NULL;
+	int type, dtype;
+
+	read_lock(&in_dev->mc_list_lock);
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+
+	/* deleted MCA's */
+	pmc_prev = NULL;
+	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) {
+		pmc_next = pmc->next;
+		if (pmc->sfmode == MCAST_INCLUDE) {
+			type = IGMPV3_BLOCK_OLD_SOURCES;
+			dtype = IGMPV3_BLOCK_OLD_SOURCES;
+			skb = add_grec(skb, pmc, type, 1, 0);
+			skb = add_grec(skb, pmc, dtype, 1, 1);
+		}
+		if (pmc->crcount) {
+			pmc->crcount--;
+			if (pmc->sfmode == MCAST_EXCLUDE) {
+				type = IGMPV3_CHANGE_TO_INCLUDE;
+				skb = add_grec(skb, pmc, type, 1, 0);
+			}
+			if (pmc->crcount == 0) {
+				igmpv3_clear_zeros(&pmc->tomb);
+				igmpv3_clear_zeros(&pmc->sources);
+			}
+		}
+		if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) {
+			if (pmc_prev)
+				pmc_prev->next = pmc_next;
+			else
+				in_dev->mc_tomb = pmc_next;
+			in_dev_put(pmc->interface);
+			kfree(pmc);
+		} else
+			pmc_prev = pmc;
+	}
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+	/* change recs */
+	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+		spin_lock_bh(&pmc->lock);
+		if (pmc->sfcount[MCAST_EXCLUDE]) {
+			type = IGMPV3_BLOCK_OLD_SOURCES;
+			dtype = IGMPV3_ALLOW_NEW_SOURCES;
+		} else {
+			type = IGMPV3_ALLOW_NEW_SOURCES;
+			dtype = IGMPV3_BLOCK_OLD_SOURCES;
+		}
+		skb = add_grec(skb, pmc, type, 0, 0);
+		skb = add_grec(skb, pmc, dtype, 0, 1);	/* deleted sources */
+
+		/* filter mode changes */
+		if (pmc->crcount) {
+			pmc->crcount--;
+			if (pmc->sfmode == MCAST_EXCLUDE)
+				type = IGMPV3_CHANGE_TO_EXCLUDE;
+			else
+				type = IGMPV3_CHANGE_TO_INCLUDE;
+			skb = add_grec(skb, pmc, type, 0, 0);
+		}
+		spin_unlock_bh(&pmc->lock);
+	}
+	read_unlock(&in_dev->mc_list_lock);
+
+	if (!skb)
+		return;
+	(void) igmpv3_sendpack(skb);
+}
+
+static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
+	int type)
+{
+	struct sk_buff *skb;
+	struct iphdr *iph;
+	struct igmphdr *ih;
+	struct rtable *rt;
+	struct net_device *dev = in_dev->dev;
+	u32	group = pmc ? pmc->multiaddr : 0;
+	u32	dst;
+
+	if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
+		return igmpv3_send_report(in_dev, pmc);
+	else if (type == IGMP_HOST_LEAVE_MESSAGE)
+		dst = IGMP_ALL_ROUTER;
+	else
+		dst = group;
+
+	{
+		struct flowi fl = { .oif = dev->ifindex,
+				    .nl_u = { .ip4_u = { .daddr = dst } },
+				    .proto = IPPROTO_IGMP };
+		if (ip_route_output_key(&rt, &fl))
+			return -1;
+	}
+	if (rt->rt_src == 0) {
+		ip_rt_put(rt);
+		return -1;
+	}
+
+	skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC);
+	if (skb == NULL) {
+		ip_rt_put(rt);
+		return -1;
+	}
+
+	skb->dst = &rt->u.dst;
+
+	skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+	skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4);
+
+	iph->version  = 4;
+	iph->ihl      = (sizeof(struct iphdr)+4)>>2;
+	iph->tos      = 0xc0;
+	iph->frag_off = htons(IP_DF);
+	iph->ttl      = 1;
+	iph->daddr    = dst;
+	iph->saddr    = rt->rt_src;
+	iph->protocol = IPPROTO_IGMP;
+	iph->tot_len  = htons(IGMP_SIZE);
+	ip_select_ident(iph, &rt->u.dst, NULL);
+	((u8*)&iph[1])[0] = IPOPT_RA;
+	((u8*)&iph[1])[1] = 4;
+	((u8*)&iph[1])[2] = 0;
+	((u8*)&iph[1])[3] = 0;
+	ip_send_check(iph);
+
+	ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+	ih->type=type;
+	ih->code=0;
+	ih->csum=0;
+	ih->group=group;
+	ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr));
+
+	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+		       dst_output);
+}
+
+static void igmp_gq_timer_expire(unsigned long data)
+{
+	struct in_device *in_dev = (struct in_device *)data;
+
+	in_dev->mr_gq_running = 0;
+	igmpv3_send_report(in_dev, NULL);
+	__in_dev_put(in_dev);
+}
+
+static void igmp_ifc_timer_expire(unsigned long data)
+{
+	struct in_device *in_dev = (struct in_device *)data;
+
+	igmpv3_send_cr(in_dev);
+	if (in_dev->mr_ifc_count) {
+		in_dev->mr_ifc_count--;
+		igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval);
+	}
+	__in_dev_put(in_dev);
+}
+
+static void igmp_ifc_event(struct in_device *in_dev)
+{
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
+		return;
+	in_dev->mr_ifc_count = in_dev->mr_qrv ? in_dev->mr_qrv : 
+		IGMP_Unsolicited_Report_Count;
+	igmp_ifc_start_timer(in_dev, 1);
+}
+
+
+static void igmp_timer_expire(unsigned long data)
+{
+	struct ip_mc_list *im=(struct ip_mc_list *)data;
+	struct in_device *in_dev = im->interface;
+
+	spin_lock(&im->lock);
+	im->tm_running=0;
+
+	if (im->unsolicit_count) {
+		im->unsolicit_count--;
+		igmp_start_timer(im, IGMP_Unsolicited_Report_Interval);
+	}
+	im->reporter = 1;
+	spin_unlock(&im->lock);
+
+	if (IGMP_V1_SEEN(in_dev))
+		igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+	else if (IGMP_V2_SEEN(in_dev))
+		igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+	else
+		igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
+
+	ip_ma_put(im);
+}
+
+static void igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __u32 *srcs)
+{
+	struct ip_sf_list *psf;
+	int i, scount;
+
+	scount = 0;
+	for (psf=pmc->sources; psf; psf=psf->sf_next) {
+		if (scount == nsrcs)
+			break;
+		for (i=0; i<nsrcs; i++)
+			if (srcs[i] == psf->sf_inaddr) {
+				psf->sf_gsresp = 1;
+				scount++;
+				break;
+			}
+	}
+}
+
+static void igmp_heard_report(struct in_device *in_dev, u32 group)
+{
+	struct ip_mc_list *im;
+
+	/* Timers are only set for non-local groups */
+
+	if (group == IGMP_ALL_HOSTS)
+		return;
+
+	read_lock(&in_dev->mc_list_lock);
+	for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+		if (im->multiaddr == group) {
+			igmp_stop_timer(im);
+			break;
+		}
+	}
+	read_unlock(&in_dev->mc_list_lock);
+}
+
+static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
+	int len)
+{
+	struct igmphdr 		*ih = skb->h.igmph;
+	struct igmpv3_query *ih3 = (struct igmpv3_query *)ih;
+	struct ip_mc_list	*im;
+	u32			group = ih->group;
+	int			max_delay;
+	int			mark = 0;
+
+
+	if (len == 8) {
+		if (ih->code == 0) {
+			/* Alas, old v1 router presents here. */
+	
+			max_delay = IGMP_Query_Response_Interval;
+			in_dev->mr_v1_seen = jiffies +
+				IGMP_V1_Router_Present_Timeout;
+			group = 0;
+		} else {
+			/* v2 router present */
+			max_delay = ih->code*(HZ/IGMP_TIMER_SCALE);
+			in_dev->mr_v2_seen = jiffies +
+				IGMP_V2_Router_Present_Timeout;
+		}
+		/* cancel the interface change timer */
+		in_dev->mr_ifc_count = 0;
+		if (del_timer(&in_dev->mr_ifc_timer))
+			__in_dev_put(in_dev);
+		/* clear deleted report items */
+		igmpv3_clear_delrec(in_dev);
+	} else if (len < 12) {
+		return;	/* ignore bogus packet; freed by caller */
+	} else { /* v3 */
+		if (!pskb_may_pull(skb, sizeof(struct igmpv3_query)))
+			return;
+		
+		ih3 = (struct igmpv3_query *) skb->h.raw;
+		if (ih3->nsrcs) {
+			if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) 
+					   + ntohs(ih3->nsrcs)*sizeof(__u32)))
+				return;
+			ih3 = (struct igmpv3_query *) skb->h.raw;
+		}
+
+		max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
+		if (!max_delay)
+			max_delay = 1;	/* can't mod w/ 0 */
+		in_dev->mr_maxdelay = max_delay;
+		if (ih3->qrv)
+			in_dev->mr_qrv = ih3->qrv;
+		if (!group) { /* general query */
+			if (ih3->nsrcs)
+				return;	/* no sources allowed */
+			igmp_gq_start_timer(in_dev);
+			return;
+		}
+		/* mark sources to include, if group & source-specific */
+		mark = ih3->nsrcs != 0;
+	}
+
+	/*
+	 * - Start the timers in all of our membership records
+	 *   that the query applies to for the interface on
+	 *   which the query arrived excl. those that belong
+	 *   to a "local" group (224.0.0.X)
+	 * - For timers already running check if they need to
+	 *   be reset.
+	 * - Use the igmp->igmp_code field as the maximum
+	 *   delay possible
+	 */
+	read_lock(&in_dev->mc_list_lock);
+	for (im=in_dev->mc_list; im!=NULL; im=im->next) {
+		if (group && group != im->multiaddr)
+			continue;
+		if (im->multiaddr == IGMP_ALL_HOSTS)
+			continue;
+		spin_lock_bh(&im->lock);
+		if (im->tm_running)
+			im->gsquery = im->gsquery && mark;
+		else
+			im->gsquery = mark;
+		if (im->gsquery)
+			igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs);
+		spin_unlock_bh(&im->lock);
+		igmp_mod_timer(im, max_delay);
+	}
+	read_unlock(&in_dev->mc_list_lock);
+}
+
+int igmp_rcv(struct sk_buff *skb)
+{
+	/* This basically follows the spec line by line -- see RFC1112 */
+	struct igmphdr *ih;
+	struct in_device *in_dev = in_dev_get(skb->dev);
+	int len = skb->len;
+
+	if (in_dev==NULL) {
+		kfree_skb(skb);
+		return 0;
+	}
+
+	if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || 
+	    (u16)csum_fold(skb_checksum(skb, 0, len, 0))) {
+		in_dev_put(in_dev);
+		kfree_skb(skb);
+		return 0;
+	}
+
+	ih = skb->h.igmph;
+	switch (ih->type) {
+	case IGMP_HOST_MEMBERSHIP_QUERY:
+		igmp_heard_query(in_dev, skb, len);
+		break;
+	case IGMP_HOST_MEMBERSHIP_REPORT:
+	case IGMPV2_HOST_MEMBERSHIP_REPORT:
+	case IGMPV3_HOST_MEMBERSHIP_REPORT:
+		/* Is it our report looped back? */
+		if (((struct rtable*)skb->dst)->fl.iif == 0)
+			break;
+		igmp_heard_report(in_dev, ih->group);
+		break;
+	case IGMP_PIM:
+#ifdef CONFIG_IP_PIMSM_V1
+		in_dev_put(in_dev);
+		return pim_rcv_v1(skb);
+#endif
+	case IGMP_DVMRP:
+	case IGMP_TRACE:
+	case IGMP_HOST_LEAVE_MESSAGE:
+	case IGMP_MTRACE:
+	case IGMP_MTRACE_RESP:
+		break;
+	default:
+		NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type));
+	}
+	in_dev_put(in_dev);
+	kfree_skb(skb);
+	return 0;
+}
+
+#endif
+
+
+/*
+ *	Add a filter to a device
+ */
+
+static void ip_mc_filter_add(struct in_device *in_dev, u32 addr)
+{
+	char buf[MAX_ADDR_LEN];
+	struct net_device *dev = in_dev->dev;
+
+	/* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG.
+	   We will get multicast token leakage, when IFF_MULTICAST
+	   is changed. This check should be done in dev->set_multicast_list
+	   routine. Something sort of:
+	   if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; }
+	   --ANK
+	   */
+	if (arp_mc_map(addr, buf, dev, 0) == 0)
+		dev_mc_add(dev,buf,dev->addr_len,0);
+}
+
+/*
+ *	Remove a filter from a device
+ */
+
+static void ip_mc_filter_del(struct in_device *in_dev, u32 addr)
+{
+	char buf[MAX_ADDR_LEN];
+	struct net_device *dev = in_dev->dev;
+
+	if (arp_mc_map(addr, buf, dev, 0) == 0)
+		dev_mc_delete(dev,buf,dev->addr_len,0);
+}
+
+#ifdef CONFIG_IP_MULTICAST
+/*
+ * deleted ip_mc_list manipulation
+ */
+static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
+{
+	struct ip_mc_list *pmc;
+
+	/* this is an "ip_mc_list" for convenience; only the fields below
+	 * are actually used. In particular, the refcnt and users are not
+	 * used for management of the delete list. Using the same structure
+	 * for deleted items allows change reports to use common code with
+	 * non-deleted or query-response MCA's.
+	 */
+	pmc = (struct ip_mc_list *)kmalloc(sizeof(*pmc), GFP_KERNEL);
+	if (!pmc)
+		return;
+	memset(pmc, 0, sizeof(*pmc));
+	spin_lock_bh(&im->lock);
+	pmc->interface = im->interface;
+	in_dev_hold(in_dev);
+	pmc->multiaddr = im->multiaddr;
+	pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+		IGMP_Unsolicited_Report_Count;
+	pmc->sfmode = im->sfmode;
+	if (pmc->sfmode == MCAST_INCLUDE) {
+		struct ip_sf_list *psf;
+
+		pmc->tomb = im->tomb;
+		pmc->sources = im->sources;
+		im->tomb = im->sources = NULL;
+		for (psf=pmc->sources; psf; psf=psf->sf_next)
+			psf->sf_crcount = pmc->crcount;
+	}
+	spin_unlock_bh(&im->lock);
+
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	pmc->next = in_dev->mc_tomb;
+	in_dev->mc_tomb = pmc;
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+}
+
+static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr)
+{
+	struct ip_mc_list *pmc, *pmc_prev;
+	struct ip_sf_list *psf, *psf_next;
+
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	pmc_prev = NULL;
+	for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) {
+		if (pmc->multiaddr == multiaddr)
+			break;
+		pmc_prev = pmc;
+	}
+	if (pmc) {
+		if (pmc_prev)
+			pmc_prev->next = pmc->next;
+		else
+			in_dev->mc_tomb = pmc->next;
+	}
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+	if (pmc) {
+		for (psf=pmc->tomb; psf; psf=psf_next) {
+			psf_next = psf->sf_next;
+			kfree(psf);
+		}
+		in_dev_put(pmc->interface);
+		kfree(pmc);
+	}
+}
+
+static void igmpv3_clear_delrec(struct in_device *in_dev)
+{
+	struct ip_mc_list *pmc, *nextpmc;
+
+	spin_lock_bh(&in_dev->mc_tomb_lock);
+	pmc = in_dev->mc_tomb;
+	in_dev->mc_tomb = NULL;
+	spin_unlock_bh(&in_dev->mc_tomb_lock);
+
+	for (; pmc; pmc = nextpmc) {
+		nextpmc = pmc->next;
+		ip_mc_clear_src(pmc);
+		in_dev_put(pmc->interface);
+		kfree(pmc);
+	}
+	/* clear dead sources, too */
+	read_lock(&in_dev->mc_list_lock);
+	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+		struct ip_sf_list *psf, *psf_next;
+
+		spin_lock_bh(&pmc->lock);
+		psf = pmc->tomb;
+		pmc->tomb = NULL;
+		spin_unlock_bh(&pmc->lock);
+		for (; psf; psf=psf_next) {
+			psf_next = psf->sf_next;
+			kfree(psf);
+		}
+	}
+	read_unlock(&in_dev->mc_list_lock);
+}
+#endif
+
+static void igmp_group_dropped(struct ip_mc_list *im)
+{
+	struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+	int reporter;
+#endif
+
+	if (im->loaded) {
+		im->loaded = 0;
+		ip_mc_filter_del(in_dev, im->multiaddr);
+	}
+
+#ifdef CONFIG_IP_MULTICAST
+	if (im->multiaddr == IGMP_ALL_HOSTS)
+		return;
+
+	reporter = im->reporter;
+	igmp_stop_timer(im);
+
+	if (!in_dev->dead) {
+		if (IGMP_V1_SEEN(in_dev))
+			goto done;
+		if (IGMP_V2_SEEN(in_dev)) {
+			if (reporter)
+				igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE);
+			goto done;
+		}
+		/* IGMPv3 */
+		igmpv3_add_delrec(in_dev, im);
+
+		igmp_ifc_event(in_dev);
+	}
+done:
+#endif
+	ip_mc_clear_src(im);
+}
+
+static void igmp_group_added(struct ip_mc_list *im)
+{
+	struct in_device *in_dev = im->interface;
+
+	if (im->loaded == 0) {
+		im->loaded = 1;
+		ip_mc_filter_add(in_dev, im->multiaddr);
+	}
+
+#ifdef CONFIG_IP_MULTICAST
+	if (im->multiaddr == IGMP_ALL_HOSTS)
+		return;
+
+	if (in_dev->dead)
+		return;
+	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
+		spin_lock_bh(&im->lock);
+		igmp_start_timer(im, IGMP_Initial_Report_Delay);
+		spin_unlock_bh(&im->lock);
+		return;
+	}
+	/* else, v3 */
+
+	im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
+		IGMP_Unsolicited_Report_Count;
+	igmp_ifc_event(in_dev);
+#endif
+}
+
+
+/*
+ *	Multicast list managers
+ */
+
+
+/*
+ *	A socket has joined a multicast group on device dev.
+ */
+
+void ip_mc_inc_group(struct in_device *in_dev, u32 addr)
+{
+	struct ip_mc_list *im;
+
+	ASSERT_RTNL();
+
+	for (im=in_dev->mc_list; im; im=im->next) {
+		if (im->multiaddr == addr) {
+			im->users++;
+			ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+			goto out;
+		}
+	}
+
+	im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL);
+	if (!im)
+		goto out;
+
+	im->users=1;
+	im->interface=in_dev;
+	in_dev_hold(in_dev);
+	im->multiaddr=addr;
+	/* initial mode is (EX, empty) */
+	im->sfmode = MCAST_EXCLUDE;
+	im->sfcount[MCAST_INCLUDE] = 0;
+	im->sfcount[MCAST_EXCLUDE] = 1;
+	im->sources = NULL;
+	im->tomb = NULL;
+	im->crcount = 0;
+	atomic_set(&im->refcnt, 1);
+	spin_lock_init(&im->lock);
+#ifdef CONFIG_IP_MULTICAST
+	im->tm_running=0;
+	init_timer(&im->timer);
+	im->timer.data=(unsigned long)im;
+	im->timer.function=&igmp_timer_expire;
+	im->unsolicit_count = IGMP_Unsolicited_Report_Count;
+	im->reporter = 0;
+	im->gsquery = 0;
+#endif
+	im->loaded = 0;
+	write_lock_bh(&in_dev->mc_list_lock);
+	im->next=in_dev->mc_list;
+	in_dev->mc_list=im;
+	write_unlock_bh(&in_dev->mc_list_lock);
+#ifdef CONFIG_IP_MULTICAST
+	igmpv3_del_delrec(in_dev, im->multiaddr);
+#endif
+	igmp_group_added(im);
+	if (!in_dev->dead)
+		ip_rt_multicast_event(in_dev);
+out:
+	return;
+}
+
+/*
+ *	A socket has left a multicast group on device dev
+ */
+
+void ip_mc_dec_group(struct in_device *in_dev, u32 addr)
+{
+	struct ip_mc_list *i, **ip;
+	
+	ASSERT_RTNL();
+	
+	for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) {
+		if (i->multiaddr==addr) {
+			if (--i->users == 0) {
+				write_lock_bh(&in_dev->mc_list_lock);
+				*ip = i->next;
+				write_unlock_bh(&in_dev->mc_list_lock);
+				igmp_group_dropped(i);
+
+				if (!in_dev->dead)
+					ip_rt_multicast_event(in_dev);
+
+				ip_ma_put(i);
+				return;
+			}
+			break;
+		}
+	}
+}
+
+/* Device going down */
+
+void ip_mc_down(struct in_device *in_dev)
+{
+	struct ip_mc_list *i;
+
+	ASSERT_RTNL();
+
+	for (i=in_dev->mc_list; i; i=i->next)
+		igmp_group_dropped(i);
+
+#ifdef CONFIG_IP_MULTICAST
+	in_dev->mr_ifc_count = 0;
+	if (del_timer(&in_dev->mr_ifc_timer))
+		__in_dev_put(in_dev);
+	in_dev->mr_gq_running = 0;
+	if (del_timer(&in_dev->mr_gq_timer))
+		__in_dev_put(in_dev);
+	igmpv3_clear_delrec(in_dev);
+#endif
+
+	ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS);
+}
+
+void ip_mc_init_dev(struct in_device *in_dev)
+{
+	ASSERT_RTNL();
+
+	in_dev->mc_tomb = NULL;
+#ifdef CONFIG_IP_MULTICAST
+	in_dev->mr_gq_running = 0;
+	init_timer(&in_dev->mr_gq_timer);
+	in_dev->mr_gq_timer.data=(unsigned long) in_dev;
+	in_dev->mr_gq_timer.function=&igmp_gq_timer_expire;
+	in_dev->mr_ifc_count = 0;
+	init_timer(&in_dev->mr_ifc_timer);
+	in_dev->mr_ifc_timer.data=(unsigned long) in_dev;
+	in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire;
+	in_dev->mr_qrv = IGMP_Unsolicited_Report_Count;
+#endif
+
+	rwlock_init(&in_dev->mc_list_lock);
+	spin_lock_init(&in_dev->mc_tomb_lock);
+}
+
+/* Device going up */
+
+void ip_mc_up(struct in_device *in_dev)
+{
+	struct ip_mc_list *i;
+
+	ASSERT_RTNL();
+
+	ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
+
+	for (i=in_dev->mc_list; i; i=i->next)
+		igmp_group_added(i);
+}
+
+/*
+ *	Device is about to be destroyed: clean up.
+ */
+
+void ip_mc_destroy_dev(struct in_device *in_dev)
+{
+	struct ip_mc_list *i;
+
+	ASSERT_RTNL();
+
+	/* Deactivate timers */
+	ip_mc_down(in_dev);
+
+	write_lock_bh(&in_dev->mc_list_lock);
+	while ((i = in_dev->mc_list) != NULL) {
+		in_dev->mc_list = i->next;
+		write_unlock_bh(&in_dev->mc_list_lock);
+
+		igmp_group_dropped(i);
+		ip_ma_put(i);
+
+		write_lock_bh(&in_dev->mc_list_lock);
+	}
+	write_unlock_bh(&in_dev->mc_list_lock);
+}
+
+static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
+{
+	struct flowi fl = { .nl_u = { .ip4_u =
+				      { .daddr = imr->imr_multiaddr.s_addr } } };
+	struct rtable *rt;
+	struct net_device *dev = NULL;
+	struct in_device *idev = NULL;
+
+	if (imr->imr_ifindex) {
+		idev = inetdev_by_index(imr->imr_ifindex);
+		if (idev)
+			__in_dev_put(idev);
+		return idev;
+	}
+	if (imr->imr_address.s_addr) {
+		dev = ip_dev_find(imr->imr_address.s_addr);
+		if (!dev)
+			return NULL;
+		__dev_put(dev);
+	}
+
+	if (!dev && !ip_route_output_key(&rt, &fl)) {
+		dev = rt->u.dst.dev;
+		ip_rt_put(rt);
+	}
+	if (dev) {
+		imr->imr_ifindex = dev->ifindex;
+		idev = __in_dev_get(dev);
+	}
+	return idev;
+}
+
+/*
+ *	Join a socket to a group
+ */
+int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS;
+int sysctl_igmp_max_msf = IP_MAX_MSF;
+
+
+static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
+	__u32 *psfsrc)
+{
+	struct ip_sf_list *psf, *psf_prev;
+	int rv = 0;
+
+	psf_prev = NULL;
+	for (psf=pmc->sources; psf; psf=psf->sf_next) {
+		if (psf->sf_inaddr == *psfsrc)
+			break;
+		psf_prev = psf;
+	}
+	if (!psf || psf->sf_count[sfmode] == 0) {
+		/* source filter not found, or count wrong =>  bug */
+		return -ESRCH;
+	}
+	psf->sf_count[sfmode]--;
+	if (psf->sf_count[sfmode] == 0) {
+		ip_rt_multicast_event(pmc->interface);
+	}
+	if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+		struct in_device *in_dev = pmc->interface;
+#endif
+
+		/* no more filters for this source */
+		if (psf_prev)
+			psf_prev->sf_next = psf->sf_next;
+		else
+			pmc->sources = psf->sf_next;
+#ifdef CONFIG_IP_MULTICAST
+		if (psf->sf_oldin &&
+		    !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
+			psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 
+				IGMP_Unsolicited_Report_Count;
+			psf->sf_next = pmc->tomb;
+			pmc->tomb = psf;
+			rv = 1;
+		} else
+#endif
+			kfree(psf);
+	}
+	return rv;
+}
+
+#ifndef CONFIG_IP_MULTICAST
+#define igmp_ifc_event(x)	do { } while (0)
+#endif
+
+static int ip_mc_del_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+			 int sfcount, __u32 *psfsrc, int delta)
+{
+	struct ip_mc_list *pmc;
+	int	changerec = 0;
+	int	i, err;
+
+	if (!in_dev)
+		return -ENODEV;
+	read_lock(&in_dev->mc_list_lock);
+	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+		if (*pmca == pmc->multiaddr)
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		read_unlock(&in_dev->mc_list_lock);
+		return -ESRCH;
+	}
+	spin_lock_bh(&pmc->lock);
+	read_unlock(&in_dev->mc_list_lock);
+#ifdef CONFIG_IP_MULTICAST
+	sf_markstate(pmc);
+#endif
+	if (!delta) {
+		err = -EINVAL;
+		if (!pmc->sfcount[sfmode])
+			goto out_unlock;
+		pmc->sfcount[sfmode]--;
+	}
+	err = 0;
+	for (i=0; i<sfcount; i++) {
+		int rv = ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+
+		changerec |= rv > 0;
+		if (!err && rv < 0)
+			err = rv;
+	}
+	if (pmc->sfmode == MCAST_EXCLUDE &&
+	    pmc->sfcount[MCAST_EXCLUDE] == 0 &&
+	    pmc->sfcount[MCAST_INCLUDE]) {
+#ifdef CONFIG_IP_MULTICAST
+		struct ip_sf_list *psf;
+#endif
+
+		/* filter mode change */
+		pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+		pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 
+			IGMP_Unsolicited_Report_Count;
+		in_dev->mr_ifc_count = pmc->crcount;
+		for (psf=pmc->sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		igmp_ifc_event(pmc->interface);
+	} else if (sf_setstate(pmc) || changerec) {
+		igmp_ifc_event(pmc->interface);
+#endif
+	}
+out_unlock:
+	spin_unlock_bh(&pmc->lock);
+	return err;
+}
+
+/*
+ * Add multicast single-source filter to the interface list
+ */
+static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode,
+	__u32 *psfsrc, int delta)
+{
+	struct ip_sf_list *psf, *psf_prev;
+
+	psf_prev = NULL;
+	for (psf=pmc->sources; psf; psf=psf->sf_next) {
+		if (psf->sf_inaddr == *psfsrc)
+			break;
+		psf_prev = psf;
+	}
+	if (!psf) {
+		psf = (struct ip_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC);
+		if (!psf)
+			return -ENOBUFS;
+		memset(psf, 0, sizeof(*psf));
+		psf->sf_inaddr = *psfsrc;
+		if (psf_prev) {
+			psf_prev->sf_next = psf;
+		} else
+			pmc->sources = psf;
+	}
+	psf->sf_count[sfmode]++;
+	if (psf->sf_count[sfmode] == 1) {
+		ip_rt_multicast_event(pmc->interface);
+	}
+	return 0;
+}
+
+#ifdef CONFIG_IP_MULTICAST
+static void sf_markstate(struct ip_mc_list *pmc)
+{
+	struct ip_sf_list *psf;
+	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+
+	for (psf=pmc->sources; psf; psf=psf->sf_next)
+		if (pmc->sfcount[MCAST_EXCLUDE]) {
+			psf->sf_oldin = mca_xcount ==
+				psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+		} else
+			psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0;
+}
+
+static int sf_setstate(struct ip_mc_list *pmc)
+{
+	struct ip_sf_list *psf;
+	int mca_xcount = pmc->sfcount[MCAST_EXCLUDE];
+	int qrv = pmc->interface->mr_qrv;
+	int new_in, rv;
+
+	rv = 0;
+	for (psf=pmc->sources; psf; psf=psf->sf_next) {
+		if (pmc->sfcount[MCAST_EXCLUDE]) {
+			new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] &&
+				!psf->sf_count[MCAST_INCLUDE];
+		} else
+			new_in = psf->sf_count[MCAST_INCLUDE] != 0;
+		if (new_in != psf->sf_oldin) {
+			psf->sf_crcount = qrv;
+			rv++;
+		}
+	}
+	return rv;
+}
+#endif
+
+/*
+ * Add multicast source filter list to the interface list
+ */
+static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode,
+			 int sfcount, __u32 *psfsrc, int delta)
+{
+	struct ip_mc_list *pmc;
+	int	isexclude;
+	int	i, err;
+
+	if (!in_dev)
+		return -ENODEV;
+	read_lock(&in_dev->mc_list_lock);
+	for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) {
+		if (*pmca == pmc->multiaddr)
+			break;
+	}
+	if (!pmc) {
+		/* MCA not found?? bug */
+		read_unlock(&in_dev->mc_list_lock);
+		return -ESRCH;
+	}
+	spin_lock_bh(&pmc->lock);
+	read_unlock(&in_dev->mc_list_lock);
+
+#ifdef CONFIG_IP_MULTICAST
+	sf_markstate(pmc);
+#endif
+	isexclude = pmc->sfmode == MCAST_EXCLUDE;
+	if (!delta)
+		pmc->sfcount[sfmode]++;
+	err = 0;
+	for (i=0; i<sfcount; i++) {
+		err = ip_mc_add1_src(pmc, sfmode, &psfsrc[i], delta);
+		if (err)
+			break;
+	}
+	if (err) {
+		int j;
+
+		pmc->sfcount[sfmode]--;
+		for (j=0; j<i; j++)
+			(void) ip_mc_del1_src(pmc, sfmode, &psfsrc[i]);
+	} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
+#ifdef CONFIG_IP_MULTICAST
+		struct in_device *in_dev = pmc->interface;
+		struct ip_sf_list *psf;
+#endif
+
+		/* filter mode change */
+		if (pmc->sfcount[MCAST_EXCLUDE])
+			pmc->sfmode = MCAST_EXCLUDE;
+		else if (pmc->sfcount[MCAST_INCLUDE])
+			pmc->sfmode = MCAST_INCLUDE;
+#ifdef CONFIG_IP_MULTICAST
+		/* else no filters; keep old mode for reports */
+
+		pmc->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : 
+			IGMP_Unsolicited_Report_Count;
+		in_dev->mr_ifc_count = pmc->crcount;
+		for (psf=pmc->sources; psf; psf = psf->sf_next)
+			psf->sf_crcount = 0;
+		igmp_ifc_event(in_dev);
+	} else if (sf_setstate(pmc)) {
+		igmp_ifc_event(in_dev);
+#endif
+	}
+	spin_unlock_bh(&pmc->lock);
+	return err;
+}
+
+static void ip_mc_clear_src(struct ip_mc_list *pmc)
+{
+	struct ip_sf_list *psf, *nextpsf;
+
+	for (psf=pmc->tomb; psf; psf=nextpsf) {
+		nextpsf = psf->sf_next;
+		kfree(psf);
+	}
+	pmc->tomb = NULL;
+	for (psf=pmc->sources; psf; psf=nextpsf) {
+		nextpsf = psf->sf_next;
+		kfree(psf);
+	}
+	pmc->sources = NULL;
+	pmc->sfmode = MCAST_EXCLUDE;
+	pmc->sfcount[MCAST_EXCLUDE] = 0;
+	pmc->sfcount[MCAST_EXCLUDE] = 1;
+}
+
+
+/*
+ * Join a multicast group
+ */
+int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
+{
+	int err;
+	u32 addr = imr->imr_multiaddr.s_addr;
+	struct ip_mc_socklist *iml, *i;
+	struct in_device *in_dev;
+	struct inet_sock *inet = inet_sk(sk);
+	int count = 0;
+
+	if (!MULTICAST(addr))
+		return -EINVAL;
+
+	rtnl_shlock();
+
+	in_dev = ip_mc_find_dev(imr);
+
+	if (!in_dev) {
+		iml = NULL;
+		err = -ENODEV;
+		goto done;
+	}
+
+	iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
+
+	err = -EADDRINUSE;
+	for (i = inet->mc_list; i; i = i->next) {
+		if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) {
+			/* New style additions are reference counted */
+			if (imr->imr_address.s_addr == 0) {
+				i->count++;
+				err = 0;
+			}
+			goto done;
+		}
+		count++;
+	}
+	err = -ENOBUFS;
+	if (iml == NULL || count >= sysctl_igmp_max_memberships)
+		goto done;
+	memcpy(&iml->multi, imr, sizeof(*imr));
+	iml->next = inet->mc_list;
+	iml->count = 1;
+	iml->sflist = NULL;
+	iml->sfmode = MCAST_EXCLUDE;
+	inet->mc_list = iml;
+	ip_mc_inc_group(in_dev, addr);
+	iml = NULL;
+	err = 0;
+
+done:
+	rtnl_shunlock();
+	if (iml)
+		sock_kfree_s(sk, iml, sizeof(*iml));
+	return err;
+}
+
+static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
+			   struct in_device *in_dev)
+{
+	int err;
+
+	if (iml->sflist == 0) {
+		/* any-source empty exclude case */
+		return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+			iml->sfmode, 0, NULL, 0);
+	}
+	err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
+			iml->sfmode, iml->sflist->sl_count,
+			iml->sflist->sl_addr, 0);
+	sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max));
+	iml->sflist = NULL;
+	return err;
+}
+
+/*
+ *	Ask a socket to leave a group.
+ */
+
+int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *iml, **imlp;
+
+	rtnl_lock();
+	for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) {
+		if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr &&
+		    iml->multi.imr_address.s_addr==imr->imr_address.s_addr &&
+		    (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) {
+			struct in_device *in_dev;
+
+			in_dev = inetdev_by_index(iml->multi.imr_ifindex);
+			if (in_dev)
+				(void) ip_mc_leave_src(sk, iml, in_dev);
+			if (--iml->count) {
+				rtnl_unlock();
+				if (in_dev)
+					in_dev_put(in_dev);
+				return 0;
+			}
+
+			*imlp = iml->next;
+
+			if (in_dev) {
+				ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr);
+				in_dev_put(in_dev);
+			}
+			rtnl_unlock();
+			sock_kfree_s(sk, iml, sizeof(*iml));
+			return 0;
+		}
+	}
+	rtnl_unlock();
+	return -EADDRNOTAVAIL;
+}
+
+int ip_mc_source(int add, int omode, struct sock *sk, struct
+	ip_mreq_source *mreqs, int ifindex)
+{
+	int err;
+	struct ip_mreqn imr;
+	u32 addr = mreqs->imr_multiaddr;
+	struct ip_mc_socklist *pmc;
+	struct in_device *in_dev = NULL;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *psl;
+	int i, j, rv;
+
+	if (!MULTICAST(addr))
+		return -EINVAL;
+
+	rtnl_shlock();
+
+	imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr;
+	imr.imr_address.s_addr = mreqs->imr_interface;
+	imr.imr_ifindex = ifindex;
+	in_dev = ip_mc_find_dev(&imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+	err = -EADDRNOTAVAIL;
+
+	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+		if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0)
+			break;
+	}
+	if (!pmc)		/* must have a prior join */
+		goto done;
+	/* if a source filter was set, must be the same mode as before */
+	if (pmc->sflist) {
+		if (pmc->sfmode != omode)
+			goto done;
+	} else if (pmc->sfmode != omode) {
+		/* allow mode switches for empty-set filters */
+		ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0);
+		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, 
+			NULL, 0);
+		pmc->sfmode = omode;
+	}
+
+	psl = pmc->sflist;
+	if (!add) {
+		if (!psl)
+			goto done;
+		rv = !0;
+		for (i=0; i<psl->sl_count; i++) {
+			rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+				sizeof(__u32));
+			if (rv == 0)
+				break;
+		}
+		if (rv)		/* source not found */
+			goto done;
+
+		/* update the interface filter */
+		ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 
+			&mreqs->imr_sourceaddr, 1);
+
+		for (j=i+1; j<psl->sl_count; j++)
+			psl->sl_addr[j-1] = psl->sl_addr[j];
+		psl->sl_count--;
+		err = 0;
+		goto done;
+	}
+	/* else, add a new source to the filter */
+
+	if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+		err = -ENOBUFS;
+		goto done;
+	}
+	if (!psl || psl->sl_count == psl->sl_max) {
+		struct ip_sf_socklist *newpsl;
+		int count = IP_SFBLOCK;
+
+		if (psl)
+			count += psl->sl_max;
+		newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
+			IP_SFLSIZE(count), GFP_KERNEL);
+		if (!newpsl) {
+			err = -ENOBUFS;
+			goto done;
+		}
+		newpsl->sl_max = count;
+		newpsl->sl_count = count - IP_SFBLOCK;
+		if (psl) {
+			for (i=0; i<psl->sl_count; i++)
+				newpsl->sl_addr[i] = psl->sl_addr[i];
+			sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+		}
+		pmc->sflist = psl = newpsl;
+	}
+	rv = 1;	/* > 0 for insert logic below if sl_count is 0 */
+	for (i=0; i<psl->sl_count; i++) {
+		rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr,
+			sizeof(__u32));
+		if (rv == 0)
+			break;
+	}
+	if (rv == 0)		/* address already there is an error */
+		goto done;
+	for (j=psl->sl_count-1; j>=i; j--)
+		psl->sl_addr[j+1] = psl->sl_addr[j];
+	psl->sl_addr[i] = mreqs->imr_sourceaddr;
+	psl->sl_count++;
+	err = 0;
+	/* update the interface list */
+	ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, 
+		&mreqs->imr_sourceaddr, 1);
+done:
+	rtnl_shunlock();
+	return err;
+}
+
+int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
+{
+	int err;
+	struct ip_mreqn	imr;
+	u32 addr = msf->imsf_multiaddr;
+	struct ip_mc_socklist *pmc;
+	struct in_device *in_dev;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *newpsl, *psl;
+
+	if (!MULTICAST(addr))
+		return -EINVAL;
+	if (msf->imsf_fmode != MCAST_INCLUDE &&
+	    msf->imsf_fmode != MCAST_EXCLUDE)
+		return -EINVAL;
+
+	rtnl_shlock();
+
+	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+	imr.imr_address.s_addr = msf->imsf_interface;
+	imr.imr_ifindex = ifindex;
+	in_dev = ip_mc_find_dev(&imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+	err = -EADDRNOTAVAIL;
+
+	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+		    pmc->multi.imr_ifindex == imr.imr_ifindex)
+			break;
+	}
+	if (!pmc)		/* must have a prior join */
+		goto done;
+	if (msf->imsf_numsrc) {
+		newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk,
+				IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL);
+		if (!newpsl) {
+			err = -ENOBUFS;
+			goto done;
+		}
+		newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc;
+		memcpy(newpsl->sl_addr, msf->imsf_slist,
+			msf->imsf_numsrc * sizeof(msf->imsf_slist[0]));
+		err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr,
+			msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0);
+		if (err) {
+			sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max));
+			goto done;
+		}
+	} else
+		newpsl = NULL;
+	psl = pmc->sflist;
+	if (psl) {
+		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+			psl->sl_count, psl->sl_addr, 0);
+		sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+	} else
+		(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
+			0, NULL, 0);
+	pmc->sflist = newpsl;
+	pmc->sfmode = msf->imsf_fmode;
+done:
+	rtnl_shunlock();
+	return err;
+}
+
+int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
+	struct ip_msfilter __user *optval, int __user *optlen)
+{
+	int err, len, count, copycount;
+	struct ip_mreqn	imr;
+	u32 addr = msf->imsf_multiaddr;
+	struct ip_mc_socklist *pmc;
+	struct in_device *in_dev;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *psl;
+
+	if (!MULTICAST(addr))
+		return -EINVAL;
+
+	rtnl_shlock();
+
+	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
+	imr.imr_address.s_addr = msf->imsf_interface;
+	imr.imr_ifindex = 0;
+	in_dev = ip_mc_find_dev(&imr);
+
+	if (!in_dev) {
+		err = -ENODEV;
+		goto done;
+	}
+	err = -EADDRNOTAVAIL;
+
+	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+		if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr &&
+		    pmc->multi.imr_ifindex == imr.imr_ifindex)
+			break;
+	}
+	if (!pmc)		/* must have a prior join */
+		goto done;
+	msf->imsf_fmode = pmc->sfmode;
+	psl = pmc->sflist;
+	rtnl_shunlock();
+	if (!psl) {
+		len = 0;
+		count = 0;
+	} else {
+		count = psl->sl_count;
+	}
+	copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc;
+	len = copycount * sizeof(psl->sl_addr[0]);
+	msf->imsf_numsrc = count;
+	if (put_user(IP_MSFILTER_SIZE(copycount), optlen) ||
+	    copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) {
+		return -EFAULT;
+	}
+	if (len &&
+	    copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len))
+		return -EFAULT;
+	return 0;
+done:
+	rtnl_shunlock();
+	return err;
+}
+
+int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
+	struct group_filter __user *optval, int __user *optlen)
+{
+	int err, i, count, copycount;
+	struct sockaddr_in *psin;
+	u32 addr;
+	struct ip_mc_socklist *pmc;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_sf_socklist *psl;
+
+	psin = (struct sockaddr_in *)&gsf->gf_group;
+	if (psin->sin_family != AF_INET)
+		return -EINVAL;
+	addr = psin->sin_addr.s_addr;
+	if (!MULTICAST(addr))
+		return -EINVAL;
+
+	rtnl_shlock();
+
+	err = -EADDRNOTAVAIL;
+
+	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+		if (pmc->multi.imr_multiaddr.s_addr == addr &&
+		    pmc->multi.imr_ifindex == gsf->gf_interface)
+			break;
+	}
+	if (!pmc)		/* must have a prior join */
+		goto done;
+	gsf->gf_fmode = pmc->sfmode;
+	psl = pmc->sflist;
+	rtnl_shunlock();
+	count = psl ? psl->sl_count : 0;
+	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
+	gsf->gf_numsrc = count;
+	if (put_user(GROUP_FILTER_SIZE(copycount), optlen) ||
+	    copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) {
+		return -EFAULT;
+	}
+	for (i=0; i<copycount; i++) {
+		struct sockaddr_in *psin;
+		struct sockaddr_storage ss;
+
+		psin = (struct sockaddr_in *)&ss;
+		memset(&ss, 0, sizeof(ss));
+		psin->sin_family = AF_INET;
+		psin->sin_addr.s_addr = psl->sl_addr[i];
+		if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss)))
+			return -EFAULT;
+	}
+	return 0;
+done:
+	rtnl_shunlock();
+	return err;
+}
+
+/*
+ * check if a multicast source filter allows delivery for a given <src,dst,intf>
+ */
+int ip_mc_sf_allow(struct sock *sk, u32 loc_addr, u32 rmt_addr, int dif)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *pmc;
+	struct ip_sf_socklist *psl;
+	int i;
+
+	if (!MULTICAST(loc_addr))
+		return 1;
+
+	for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+		if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
+		    pmc->multi.imr_ifindex == dif)
+			break;
+	}
+	if (!pmc)
+		return 1;
+	psl = pmc->sflist;
+	if (!psl)
+		return pmc->sfmode == MCAST_EXCLUDE;
+
+	for (i=0; i<psl->sl_count; i++) {
+		if (psl->sl_addr[i] == rmt_addr)
+			break;
+	}
+	if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
+		return 0;
+	if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
+		return 0;
+	return 1;
+}
+
+/*
+ *	A socket is closing.
+ */
+
+void ip_mc_drop_socket(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_mc_socklist *iml;
+
+	if (inet->mc_list == NULL)
+		return;
+
+	rtnl_lock();
+	while ((iml = inet->mc_list) != NULL) {
+		struct in_device *in_dev;
+		inet->mc_list = iml->next;
+
+		if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) {
+			(void) ip_mc_leave_src(sk, iml, in_dev);
+			ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
+			in_dev_put(in_dev);
+		}
+		sock_kfree_s(sk, iml, sizeof(*iml));
+
+	}
+	rtnl_unlock();
+}
+
+int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto)
+{
+	struct ip_mc_list *im;
+	struct ip_sf_list *psf;
+	int rv = 0;
+
+	read_lock(&in_dev->mc_list_lock);
+	for (im=in_dev->mc_list; im; im=im->next) {
+		if (im->multiaddr == mc_addr)
+			break;
+	}
+	if (im && proto == IPPROTO_IGMP) {
+		rv = 1;
+	} else if (im) {
+		if (src_addr) {
+			for (psf=im->sources; psf; psf=psf->sf_next) {
+				if (psf->sf_inaddr == src_addr)
+					break;
+			}
+			if (psf)
+				rv = psf->sf_count[MCAST_INCLUDE] ||
+					psf->sf_count[MCAST_EXCLUDE] !=
+					im->sfcount[MCAST_EXCLUDE];
+			else
+				rv = im->sfcount[MCAST_EXCLUDE] != 0;
+		} else
+			rv = 1; /* unspecified source; tentatively allow */
+	}
+	read_unlock(&in_dev->mc_list_lock);
+	return rv;
+}
+
+#if defined(CONFIG_PROC_FS)
+struct igmp_mc_iter_state {
+	struct net_device *dev;
+	struct in_device *in_dev;
+};
+
+#define	igmp_mc_seq_private(seq)	((struct igmp_mc_iter_state *)(seq)->private)
+
+static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
+{
+	struct ip_mc_list *im = NULL;
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+
+	for (state->dev = dev_base, state->in_dev = NULL;
+	     state->dev; 
+	     state->dev = state->dev->next) {
+		struct in_device *in_dev;
+		in_dev = in_dev_get(state->dev);
+		if (!in_dev)
+			continue;
+		read_lock(&in_dev->mc_list_lock);
+		im = in_dev->mc_list;
+		if (im) {
+			state->in_dev = in_dev;
+			break;
+		}
+		read_unlock(&in_dev->mc_list_lock);
+		in_dev_put(in_dev);
+	}
+	return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im)
+{
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+	im = im->next;
+	while (!im) {
+		if (likely(state->in_dev != NULL)) {
+			read_unlock(&state->in_dev->mc_list_lock);
+			in_dev_put(state->in_dev);
+		}
+		state->dev = state->dev->next;
+		if (!state->dev) {
+			state->in_dev = NULL;
+			break;
+		}
+		state->in_dev = in_dev_get(state->dev);
+		if (!state->in_dev)
+			continue;
+		read_lock(&state->in_dev->mc_list_lock);
+		im = state->in_dev->mc_list;
+	}
+	return im;
+}
+
+static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip_mc_list *im = igmp_mc_get_first(seq);
+	if (im)
+		while (pos && (im = igmp_mc_get_next(seq, im)) != NULL)
+			--pos;
+	return pos ? NULL : im;
+}
+
+static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock(&dev_base_lock);
+	return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_mc_list *im;
+	if (v == SEQ_START_TOKEN)
+		im = igmp_mc_get_first(seq);
+	else
+		im = igmp_mc_get_next(seq, v);
+	++*pos;
+	return im;
+}
+
+static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
+{
+	struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+	if (likely(state->in_dev != NULL)) {
+		read_unlock(&state->in_dev->mc_list_lock);
+		in_dev_put(state->in_dev);
+		state->in_dev = NULL;
+	}
+	state->dev = NULL;
+	read_unlock(&dev_base_lock);
+}
+
+static int igmp_mc_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, 
+			 "Idx\tDevice    : Count Querier\tGroup    Users Timer\tReporter\n");
+	else {
+		struct ip_mc_list *im = (struct ip_mc_list *)v;
+		struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
+		char   *querier;
+#ifdef CONFIG_IP_MULTICAST
+		querier = IGMP_V1_SEEN(state->in_dev) ? "V1" :
+			  IGMP_V2_SEEN(state->in_dev) ? "V2" :
+			  "V3";
+#else
+		querier = "NONE";
+#endif
+
+		if (state->in_dev->mc_list == im) {
+			seq_printf(seq, "%d\t%-10s: %5d %7s\n",
+				   state->dev->ifindex, state->dev->name, state->dev->mc_count, querier);
+		}
+
+		seq_printf(seq,
+			   "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n",
+			   im->multiaddr, im->users,
+			   im->tm_running, im->tm_running ?
+			   jiffies_to_clock_t(im->timer.expires-jiffies) : 0,
+			   im->reporter);
+	}
+	return 0;
+}
+
+static struct seq_operations igmp_mc_seq_ops = {
+	.start	=	igmp_mc_seq_start,
+	.next	=	igmp_mc_seq_next,
+	.stop	=	igmp_mc_seq_stop,
+	.show	=	igmp_mc_seq_show,
+};
+
+static int igmp_mc_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct igmp_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (!s)
+		goto out;
+	rc = seq_open(file, &igmp_mc_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq = file->private_data;
+	seq->private = s;
+	memset(s, 0, sizeof(*s));
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+static struct file_operations igmp_mc_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	igmp_mc_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_private,
+};
+
+struct igmp_mcf_iter_state {
+	struct net_device *dev;
+	struct in_device *idev;
+	struct ip_mc_list *im;
+};
+
+#define igmp_mcf_seq_private(seq)	((struct igmp_mcf_iter_state *)(seq)->private)
+
+static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
+{
+	struct ip_sf_list *psf = NULL;
+	struct ip_mc_list *im = NULL;
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+	for (state->dev = dev_base, state->idev = NULL, state->im = NULL;
+	     state->dev; 
+	     state->dev = state->dev->next) {
+		struct in_device *idev;
+		idev = in_dev_get(state->dev);
+		if (unlikely(idev == NULL))
+			continue;
+		read_lock(&idev->mc_list_lock);
+		im = idev->mc_list;
+		if (likely(im != NULL)) {
+			spin_lock_bh(&im->lock);
+			psf = im->sources;
+			if (likely(psf != NULL)) {
+				state->im = im;
+				state->idev = idev;
+				break;
+			}
+			spin_unlock_bh(&im->lock);
+		}
+		read_unlock(&idev->mc_list_lock);
+		in_dev_put(idev);
+	}
+	return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf)
+{
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+	psf = psf->sf_next;
+	while (!psf) {
+		spin_unlock_bh(&state->im->lock);
+		state->im = state->im->next;
+		while (!state->im) {
+			if (likely(state->idev != NULL)) {
+				read_unlock(&state->idev->mc_list_lock);
+				in_dev_put(state->idev);
+			}
+			state->dev = state->dev->next;
+			if (!state->dev) {
+				state->idev = NULL;
+				goto out;
+			}
+			state->idev = in_dev_get(state->dev);
+			if (!state->idev)
+				continue;
+			read_lock(&state->idev->mc_list_lock);
+			state->im = state->idev->mc_list;
+		}
+		if (!state->im)
+			break;
+		spin_lock_bh(&state->im->lock);
+		psf = state->im->sources;
+	}
+out:
+	return psf;
+}
+
+static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct ip_sf_list *psf = igmp_mcf_get_first(seq);
+	if (psf)
+		while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL)
+			--pos;
+	return pos ? NULL : psf;
+}
+
+static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock(&dev_base_lock);
+	return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_sf_list *psf;
+	if (v == SEQ_START_TOKEN)
+		psf = igmp_mcf_get_first(seq);
+	else
+		psf = igmp_mcf_get_next(seq, v);
+	++*pos;
+	return psf;
+}
+
+static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+{
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+	if (likely(state->im != NULL)) {
+		spin_unlock_bh(&state->im->lock);
+		state->im = NULL;
+	}
+	if (likely(state->idev != NULL)) {
+		read_unlock(&state->idev->mc_list_lock);
+		in_dev_put(state->idev);
+		state->idev = NULL;
+	}
+	state->dev = NULL;
+	read_unlock(&dev_base_lock);
+}
+
+static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
+{
+	struct ip_sf_list *psf = (struct ip_sf_list *)v;
+	struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, 
+			   "%3s %6s "
+			   "%10s %10s %6s %6s\n", "Idx",
+			   "Device", "MCA",
+			   "SRC", "INC", "EXC");
+	} else {
+		seq_printf(seq,
+			   "%3d %6.6s 0x%08x "
+			   "0x%08x %6lu %6lu\n", 
+			   state->dev->ifindex, state->dev->name, 
+			   ntohl(state->im->multiaddr),
+			   ntohl(psf->sf_inaddr),
+			   psf->sf_count[MCAST_INCLUDE],
+			   psf->sf_count[MCAST_EXCLUDE]);
+	}
+	return 0;
+}
+
+static struct seq_operations igmp_mcf_seq_ops = {
+	.start	=	igmp_mcf_seq_start,
+	.next	=	igmp_mcf_seq_next,
+	.stop	=	igmp_mcf_seq_stop,
+	.show	=	igmp_mcf_seq_show,
+};
+
+static int igmp_mcf_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct igmp_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (!s)
+		goto out;
+	rc = seq_open(file, &igmp_mcf_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq = file->private_data;
+	seq->private = s;
+	memset(s, 0, sizeof(*s));
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+static struct file_operations igmp_mcf_seq_fops = {
+	.owner		=	THIS_MODULE,
+	.open		=	igmp_mcf_seq_open,
+	.read		=	seq_read,
+	.llseek		=	seq_lseek,
+	.release	=	seq_release_private,
+};
+
+int __init igmp_mc_proc_init(void)
+{
+	proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops);
+	proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops);
+	return 0;
+}
+#endif
+
+EXPORT_SYMBOL(ip_mc_dec_group);
+EXPORT_SYMBOL(ip_mc_inc_group);
+EXPORT_SYMBOL(ip_mc_join_group);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
new file mode 100644
index 000000000000..95473953c406
--- /dev/null
+++ b/net/ipv4/inetpeer.c
@@ -0,0 +1,460 @@
+/*
+ *		INETPEER - A storage for permanent information about peers
+ *
+ *  This source is covered by the GNU GPL, the same as all kernel sources.
+ *
+ *  Version:	$Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $
+ *
+ *  Authors:	Andrey V. Savochkin <saw@msu.ru>
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/net.h>
+#include <net/inetpeer.h>
+
+/*
+ *  Theory of operations.
+ *  We keep one entry for each peer IP address.  The nodes contains long-living
+ *  information about the peer which doesn't depend on routes.
+ *  At this moment this information consists only of ID field for the next
+ *  outgoing IP packet.  This field is incremented with each packet as encoded
+ *  in inet_getid() function (include/net/inetpeer.h).
+ *  At the moment of writing this notes identifier of IP packets is generated
+ *  to be unpredictable using this code only for packets subjected
+ *  (actually or potentially) to defragmentation.  I.e. DF packets less than
+ *  PMTU in size uses a constant ID and do not use this code (see
+ *  ip_select_ident() in include/net/ip.h).
+ *
+ *  Route cache entries hold references to our nodes.
+ *  New cache entries get references via lookup by destination IP address in
+ *  the avl tree.  The reference is grabbed only when it's needed i.e. only
+ *  when we try to output IP packet which needs an unpredictable ID (see
+ *  __ip_select_ident() in net/ipv4/route.c).
+ *  Nodes are removed only when reference counter goes to 0.
+ *  When it's happened the node may be removed when a sufficient amount of
+ *  time has been passed since its last use.  The less-recently-used entry can
+ *  also be removed if the pool is overloaded i.e. if the total amount of
+ *  entries is greater-or-equal than the threshold.
+ *
+ *  Node pool is organised as an AVL tree.
+ *  Such an implementation has been chosen not just for fun.  It's a way to
+ *  prevent easy and efficient DoS attacks by creating hash collisions.  A huge
+ *  amount of long living nodes in a single hash slot would significantly delay
+ *  lookups performed with disabled BHs.
+ *
+ *  Serialisation issues.
+ *  1.  Nodes may appear in the tree only with the pool write lock held.
+ *  2.  Nodes may disappear from the tree only with the pool write lock held
+ *      AND reference count being 0.
+ *  3.  Nodes appears and disappears from unused node list only under
+ *      "inet_peer_unused_lock".
+ *  4.  Global variable peer_total is modified under the pool lock.
+ *  5.  struct inet_peer fields modification:
+ *		avl_left, avl_right, avl_parent, avl_height: pool lock
+ *		unused_next, unused_prevp: unused node list lock
+ *		refcnt: atomically against modifications on other CPU;
+ *		   usually under some other lock to prevent node disappearing
+ *		dtime: unused node list lock
+ *		v4daddr: unchangeable
+ *		ip_id_count: idlock
+ */
+
+/* Exported for inet_getid inline function.  */
+DEFINE_SPINLOCK(inet_peer_idlock);
+
+static kmem_cache_t *peer_cachep;
+
+#define node_height(x) x->avl_height
+static struct inet_peer peer_fake_node = {
+	.avl_left	= &peer_fake_node,
+	.avl_right	= &peer_fake_node,
+	.avl_height	= 0
+};
+#define peer_avl_empty (&peer_fake_node)
+static struct inet_peer *peer_root = peer_avl_empty;
+static DEFINE_RWLOCK(peer_pool_lock);
+#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */
+
+static volatile int peer_total;
+/* Exported for sysctl_net_ipv4.  */
+int inet_peer_threshold = 65536 + 128;	/* start to throw entries more
+					 * aggressively at this stage */
+int inet_peer_minttl = 120 * HZ;	/* TTL under high load: 120 sec */
+int inet_peer_maxttl = 10 * 60 * HZ;	/* usual time to live: 10 min */
+
+static struct inet_peer *inet_peer_unused_head;
+/* Exported for inet_putpeer inline function.  */
+struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head;
+DEFINE_SPINLOCK(inet_peer_unused_lock);
+#define PEER_MAX_CLEANUP_WORK 30
+
+static void peer_check_expire(unsigned long dummy);
+static struct timer_list peer_periodic_timer =
+	TIMER_INITIALIZER(peer_check_expire, 0, 0);
+
+/* Exported for sysctl_net_ipv4.  */
+int inet_peer_gc_mintime = 10 * HZ,
+    inet_peer_gc_maxtime = 120 * HZ;
+
+/* Called from ip_output.c:ip_init  */
+void __init inet_initpeers(void)
+{
+	struct sysinfo si;
+
+	/* Use the straight interface to information about memory. */
+	si_meminfo(&si);
+	/* The values below were suggested by Alexey Kuznetsov
+	 * <kuznet@ms2.inr.ac.ru>.  I don't have any opinion about the values
+	 * myself.  --SAW
+	 */
+	if (si.totalram <= (32768*1024)/PAGE_SIZE)
+		inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */
+	if (si.totalram <= (16384*1024)/PAGE_SIZE)
+		inet_peer_threshold >>= 1; /* about 512KB */
+	if (si.totalram <= (8192*1024)/PAGE_SIZE)
+		inet_peer_threshold >>= 2; /* about 128KB */
+
+	peer_cachep = kmem_cache_create("inet_peer_cache",
+			sizeof(struct inet_peer),
+			0, SLAB_HWCACHE_ALIGN,
+			NULL, NULL);
+
+	if (!peer_cachep)
+		panic("cannot create inet_peer_cache");
+
+	/* All the timers, started at system startup tend
+	   to synchronize. Perturb it a bit.
+	 */
+	peer_periodic_timer.expires = jiffies
+		+ net_random() % inet_peer_gc_maxtime
+		+ inet_peer_gc_maxtime;
+	add_timer(&peer_periodic_timer);
+}
+
+/* Called with or without local BH being disabled. */
+static void unlink_from_unused(struct inet_peer *p)
+{
+	spin_lock_bh(&inet_peer_unused_lock);
+	if (p->unused_prevp != NULL) {
+		/* On unused list. */
+		*p->unused_prevp = p->unused_next;
+		if (p->unused_next != NULL)
+			p->unused_next->unused_prevp = p->unused_prevp;
+		else
+			inet_peer_unused_tailp = p->unused_prevp;
+		p->unused_prevp = NULL; /* mark it as removed */
+	}
+	spin_unlock_bh(&inet_peer_unused_lock);
+}
+
+/* Called with local BH disabled and the pool lock held. */
+#define lookup(daddr) 						\
+({								\
+	struct inet_peer *u, **v;				\
+	stackptr = stack;					\
+	*stackptr++ = &peer_root;				\
+	for (u = peer_root; u != peer_avl_empty; ) {		\
+		if (daddr == u->v4daddr)			\
+			break;					\
+		if (daddr < u->v4daddr)				\
+			v = &u->avl_left;			\
+		else						\
+			v = &u->avl_right;			\
+		*stackptr++ = v;				\
+		u = *v;						\
+	}							\
+	u;							\
+})
+
+/* Called with local BH disabled and the pool write lock held. */
+#define lookup_rightempty(start)				\
+({								\
+	struct inet_peer *u, **v;				\
+	*stackptr++ = &start->avl_left;				\
+	v = &start->avl_left;					\
+	for (u = *v; u->avl_right != peer_avl_empty; ) {	\
+		v = &u->avl_right;				\
+		*stackptr++ = v;				\
+		u = *v;						\
+	}							\
+	u;							\
+})
+
+/* Called with local BH disabled and the pool write lock held.
+ * Variable names are the proof of operation correctness.
+ * Look into mm/map_avl.c for more detail description of the ideas.  */
+static void peer_avl_rebalance(struct inet_peer **stack[],
+		struct inet_peer ***stackend)
+{
+	struct inet_peer **nodep, *node, *l, *r;
+	int lh, rh;
+
+	while (stackend > stack) {
+		nodep = *--stackend;
+		node = *nodep;
+		l = node->avl_left;
+		r = node->avl_right;
+		lh = node_height(l);
+		rh = node_height(r);
+		if (lh > rh + 1) { /* l: RH+2 */
+			struct inet_peer *ll, *lr, *lrl, *lrr;
+			int lrh;
+			ll = l->avl_left;
+			lr = l->avl_right;
+			lrh = node_height(lr);
+			if (lrh <= node_height(ll)) {	/* ll: RH+1 */
+				node->avl_left = lr;	/* lr: RH or RH+1 */
+				node->avl_right = r;	/* r: RH */
+				node->avl_height = lrh + 1; /* RH+1 or RH+2 */
+				l->avl_left = ll;	/* ll: RH+1 */
+				l->avl_right = node;	/* node: RH+1 or RH+2 */
+				l->avl_height = node->avl_height + 1;
+				*nodep = l;
+			} else { /* ll: RH, lr: RH+1 */
+				lrl = lr->avl_left;	/* lrl: RH or RH-1 */
+				lrr = lr->avl_right;	/* lrr: RH or RH-1 */
+				node->avl_left = lrr;	/* lrr: RH or RH-1 */
+				node->avl_right = r;	/* r: RH */
+				node->avl_height = rh + 1; /* node: RH+1 */
+				l->avl_left = ll;	/* ll: RH */
+				l->avl_right = lrl;	/* lrl: RH or RH-1 */
+				l->avl_height = rh + 1;	/* l: RH+1 */
+				lr->avl_left = l;	/* l: RH+1 */
+				lr->avl_right = node;	/* node: RH+1 */
+				lr->avl_height = rh + 2;
+				*nodep = lr;
+			}
+		} else if (rh > lh + 1) { /* r: LH+2 */
+			struct inet_peer *rr, *rl, *rlr, *rll;
+			int rlh;
+			rr = r->avl_right;
+			rl = r->avl_left;
+			rlh = node_height(rl);
+			if (rlh <= node_height(rr)) {	/* rr: LH+1 */
+				node->avl_right = rl;	/* rl: LH or LH+1 */
+				node->avl_left = l;	/* l: LH */
+				node->avl_height = rlh + 1; /* LH+1 or LH+2 */
+				r->avl_right = rr;	/* rr: LH+1 */
+				r->avl_left = node;	/* node: LH+1 or LH+2 */
+				r->avl_height = node->avl_height + 1;
+				*nodep = r;
+			} else { /* rr: RH, rl: RH+1 */
+				rlr = rl->avl_right;	/* rlr: LH or LH-1 */
+				rll = rl->avl_left;	/* rll: LH or LH-1 */
+				node->avl_right = rll;	/* rll: LH or LH-1 */
+				node->avl_left = l;	/* l: LH */
+				node->avl_height = lh + 1; /* node: LH+1 */
+				r->avl_right = rr;	/* rr: LH */
+				r->avl_left = rlr;	/* rlr: LH or LH-1 */
+				r->avl_height = lh + 1;	/* r: LH+1 */
+				rl->avl_right = r;	/* r: LH+1 */
+				rl->avl_left = node;	/* node: LH+1 */
+				rl->avl_height = lh + 2;
+				*nodep = rl;
+			}
+		} else {
+			node->avl_height = (lh > rh ? lh : rh) + 1;
+		}
+	}
+}
+
+/* Called with local BH disabled and the pool write lock held. */
+#define link_to_pool(n)						\
+do {								\
+	n->avl_height = 1;					\
+	n->avl_left = peer_avl_empty;				\
+	n->avl_right = peer_avl_empty;				\
+	**--stackptr = n;					\
+	peer_avl_rebalance(stack, stackptr);			\
+} while(0)
+
+/* May be called with local BH enabled. */
+static void unlink_from_pool(struct inet_peer *p)
+{
+	int do_free;
+
+	do_free = 0;
+
+	write_lock_bh(&peer_pool_lock);
+	/* Check the reference counter.  It was artificially incremented by 1
+	 * in cleanup() function to prevent sudden disappearing.  If the
+	 * reference count is still 1 then the node is referenced only as `p'
+	 * here and from the pool.  So under the exclusive pool lock it's safe
+	 * to remove the node and free it later. */
+	if (atomic_read(&p->refcnt) == 1) {
+		struct inet_peer **stack[PEER_MAXDEPTH];
+		struct inet_peer ***stackptr, ***delp;
+		if (lookup(p->v4daddr) != p)
+			BUG();
+		delp = stackptr - 1; /* *delp[0] == p */
+		if (p->avl_left == peer_avl_empty) {
+			*delp[0] = p->avl_right;
+			--stackptr;
+		} else {
+			/* look for a node to insert instead of p */
+			struct inet_peer *t;
+			t = lookup_rightempty(p);
+			if (*stackptr[-1] != t)
+				BUG();
+			**--stackptr = t->avl_left;
+			/* t is removed, t->v4daddr > x->v4daddr for any
+			 * x in p->avl_left subtree.
+			 * Put t in the old place of p. */
+			*delp[0] = t;
+			t->avl_left = p->avl_left;
+			t->avl_right = p->avl_right;
+			t->avl_height = p->avl_height;
+			if (delp[1] != &p->avl_left)
+				BUG();
+			delp[1] = &t->avl_left; /* was &p->avl_left */
+		}
+		peer_avl_rebalance(stack, stackptr);
+		peer_total--;
+		do_free = 1;
+	}
+	write_unlock_bh(&peer_pool_lock);
+
+	if (do_free)
+		kmem_cache_free(peer_cachep, p);
+	else
+		/* The node is used again.  Decrease the reference counter
+		 * back.  The loop "cleanup -> unlink_from_unused
+		 *   -> unlink_from_pool -> putpeer -> link_to_unused
+		 *   -> cleanup (for the same node)"
+		 * doesn't really exist because the entry will have a
+		 * recent deletion time and will not be cleaned again soon. */
+		inet_putpeer(p);
+}
+
+/* May be called with local BH enabled. */
+static int cleanup_once(unsigned long ttl)
+{
+	struct inet_peer *p;
+
+	/* Remove the first entry from the list of unused nodes. */
+	spin_lock_bh(&inet_peer_unused_lock);
+	p = inet_peer_unused_head;
+	if (p != NULL) {
+		if (time_after(p->dtime + ttl, jiffies)) {
+			/* Do not prune fresh entries. */
+			spin_unlock_bh(&inet_peer_unused_lock);
+			return -1;
+		}
+		inet_peer_unused_head = p->unused_next;
+		if (p->unused_next != NULL)
+			p->unused_next->unused_prevp = p->unused_prevp;
+		else
+			inet_peer_unused_tailp = p->unused_prevp;
+		p->unused_prevp = NULL; /* mark as not on the list */
+		/* Grab an extra reference to prevent node disappearing
+		 * before unlink_from_pool() call. */
+		atomic_inc(&p->refcnt);
+	}
+	spin_unlock_bh(&inet_peer_unused_lock);
+
+	if (p == NULL)
+		/* It means that the total number of USED entries has
+		 * grown over inet_peer_threshold.  It shouldn't really
+		 * happen because of entry limits in route cache. */
+		return -1;
+
+	unlink_from_pool(p);
+	return 0;
+}
+
+/* Called with or without local BH being disabled. */
+struct inet_peer *inet_getpeer(__u32 daddr, int create)
+{
+	struct inet_peer *p, *n;
+	struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr;
+
+	/* Look up for the address quickly. */
+	read_lock_bh(&peer_pool_lock);
+	p = lookup(daddr);
+	if (p != peer_avl_empty)
+		atomic_inc(&p->refcnt);
+	read_unlock_bh(&peer_pool_lock);
+
+	if (p != peer_avl_empty) {
+		/* The existing node has been found. */
+		/* Remove the entry from unused list if it was there. */
+		unlink_from_unused(p);
+		return p;
+	}
+
+	if (!create)
+		return NULL;
+
+	/* Allocate the space outside the locked region. */
+	n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC);
+	if (n == NULL)
+		return NULL;
+	n->v4daddr = daddr;
+	atomic_set(&n->refcnt, 1);
+	n->ip_id_count = secure_ip_id(daddr);
+	n->tcp_ts_stamp = 0;
+
+	write_lock_bh(&peer_pool_lock);
+	/* Check if an entry has suddenly appeared. */
+	p = lookup(daddr);
+	if (p != peer_avl_empty)
+		goto out_free;
+
+	/* Link the node. */
+	link_to_pool(n);
+	n->unused_prevp = NULL; /* not on the list */
+	peer_total++;
+	write_unlock_bh(&peer_pool_lock);
+
+	if (peer_total >= inet_peer_threshold)
+		/* Remove one less-recently-used entry. */
+		cleanup_once(0);
+
+	return n;
+
+out_free:
+	/* The appropriate node is already in the pool. */
+	atomic_inc(&p->refcnt);
+	write_unlock_bh(&peer_pool_lock);
+	/* Remove the entry from unused list if it was there. */
+	unlink_from_unused(p);
+	/* Free preallocated the preallocated node. */
+	kmem_cache_free(peer_cachep, n);
+	return p;
+}
+
+/* Called with local BH disabled. */
+static void peer_check_expire(unsigned long dummy)
+{
+	int i;
+	int ttl;
+
+	if (peer_total >= inet_peer_threshold)
+		ttl = inet_peer_minttl;
+	else
+		ttl = inet_peer_maxttl
+				- (inet_peer_maxttl - inet_peer_minttl) / HZ *
+					peer_total / inet_peer_threshold * HZ;
+	for (i = 0; i < PEER_MAX_CLEANUP_WORK && !cleanup_once(ttl); i++);
+
+	/* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime
+	 * interval depending on the total number of entries (more entries,
+	 * less interval). */
+	peer_periodic_timer.expires = jiffies
+		+ inet_peer_gc_maxtime
+		- (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ *
+			peer_total / inet_peer_threshold * HZ;
+	add_timer(&peer_periodic_timer);
+}
+
+EXPORT_SYMBOL(inet_peer_idlock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
new file mode 100644
index 000000000000..77094aac6c28
--- /dev/null
+++ b/net/ipv4/ip_forward.c
@@ -0,0 +1,127 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP forwarding functionality.
+ *		
+ * Version:	$Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $
+ *
+ * Authors:	see ip.c
+ *
+ * Fixes:
+ *		Many		:	Split from ip.c , see ip_input.c for 
+ *					history.
+ *		Dave Gregorich	:	NULL ip_rt_put fix for multicast 
+ *					routing.
+ *		Jos Vos		:	Add call_out_firewall before sending,
+ *					use output device for accounting.
+ *		Jos Vos		:	Call forward firewall after routing
+ *					(always use output device).
+ *		Mike McLagan	:	Routing by source
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <linux/route.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+
+static inline int ip_forward_finish(struct sk_buff *skb)
+{
+	struct ip_options * opt	= &(IPCB(skb)->opt);
+
+	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+	if (unlikely(opt->optlen))
+		ip_forward_options(skb);
+
+	return dst_output(skb);
+}
+
+int ip_forward(struct sk_buff *skb)
+{
+	struct iphdr *iph;	/* Our header */
+	struct rtable *rt;	/* Route we use */
+	struct ip_options * opt	= &(IPCB(skb)->opt);
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb))
+		goto drop;
+
+	if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
+		return NET_RX_SUCCESS;
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
+	skb->ip_summed = CHECKSUM_NONE;
+	
+	/*
+	 *	According to the RFC, we must first decrease the TTL field. If
+	 *	that reaches zero, we must reply an ICMP control message telling
+	 *	that the packet's lifetime expired.
+	 */
+
+	iph = skb->nh.iph;
+
+	if (iph->ttl <= 1)
+                goto too_many_hops;
+
+	if (!xfrm4_route_forward(skb))
+		goto drop;
+
+	iph = skb->nh.iph;
+	rt = (struct rtable*)skb->dst;
+
+	if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+		goto sr_failed;
+
+	/* We are about to mangle packet. Copy it! */
+	if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len))
+		goto drop;
+	iph = skb->nh.iph;
+
+	/* Decrease ttl after skb cow done */
+	ip_decrease_ttl(iph);
+
+	/*
+	 *	We now generate an ICMP HOST REDIRECT giving the route
+	 *	we calculated.
+	 */
+	if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr)
+		ip_rt_send_redirect(skb);
+
+	skb->priority = rt_tos2priority(iph->tos);
+
+	return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev,
+		       ip_forward_finish);
+
+sr_failed:
+        /*
+	 *	Strict routing permits no gatewaying
+	 */
+         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
+         goto drop;
+
+too_many_hops:
+        /* Tell the sender its packet died... */
+        icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
new file mode 100644
index 000000000000..7f68e27eb4ea
--- /dev/null
+++ b/net/ipv4/ip_fragment.c
@@ -0,0 +1,691 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP fragmentation functionality.
+ *		
+ * Version:	$Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $
+ *
+ * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
+ *		Alan Cox <Alan.Cox@linux.org>
+ *
+ * Fixes:
+ *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
+ *		David S. Miller :	Begin massive cleanup...
+ *		Andi Kleen	:	Add sysctls.
+ *		xxxx		:	Overlapfrag bug.
+ *		Ultima          :       ip_expire() kernel panic.
+ *		Bill Hawes	:	Frag accounting and evictor fixes.
+ *		John McDonald	:	0 length frag bug.
+ *		Alexey Kuznetsov:	SMP races, threading, cleanup.
+ *		Patrick McHardy :	LRU queue of frag heads for evictor.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/jiffies.h>
+#include <linux/skbuff.h>
+#include <linux/list.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/checksum.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/inet.h>
+#include <linux/netfilter_ipv4.h>
+
+/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
+ * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
+ * as well. Or notify me, at least. --ANK
+ */
+
+/* Fragment cache limits. We will commit 256K at one time. Should we
+ * cross that limit we will prune down to 192K. This should cope with
+ * even the most extreme cases without allowing an attacker to measurably
+ * harm machine performance.
+ */
+int sysctl_ipfrag_high_thresh = 256*1024;
+int sysctl_ipfrag_low_thresh = 192*1024;
+
+/* Important NOTE! Fragment queue must be destroyed before MSL expires.
+ * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
+ */
+int sysctl_ipfrag_time = IP_FRAG_TIME;
+
+struct ipfrag_skb_cb
+{
+	struct inet_skb_parm	h;
+	int			offset;
+};
+
+#define FRAG_CB(skb)	((struct ipfrag_skb_cb*)((skb)->cb))
+
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+	struct ipq	*next;		/* linked list pointers			*/
+	struct list_head lru_list;	/* lru list member 			*/
+	u32		user;
+	u32		saddr;
+	u32		daddr;
+	u16		id;
+	u8		protocol;
+	u8		last_in;
+#define COMPLETE		4
+#define FIRST_IN		2
+#define LAST_IN			1
+
+	struct sk_buff	*fragments;	/* linked list of received fragments	*/
+	int		len;		/* total length of original datagram	*/
+	int		meat;
+	spinlock_t	lock;
+	atomic_t	refcnt;
+	struct timer_list timer;	/* when will this queue expire?		*/
+	struct ipq	**pprev;
+	int		iif;
+	struct timeval	stamp;
+};
+
+/* Hash table. */
+
+#define IPQ_HASHSZ	64
+
+/* Per-bucket lock is easy to add now. */
+static struct ipq *ipq_hash[IPQ_HASHSZ];
+static DEFINE_RWLOCK(ipfrag_lock);
+static u32 ipfrag_hash_rnd;
+static LIST_HEAD(ipq_lru_list);
+int ip_frag_nqueues = 0;
+
+static __inline__ void __ipq_unlink(struct ipq *qp)
+{
+	if(qp->next)
+		qp->next->pprev = qp->pprev;
+	*qp->pprev = qp->next;
+	list_del(&qp->lru_list);
+	ip_frag_nqueues--;
+}
+
+static __inline__ void ipq_unlink(struct ipq *ipq)
+{
+	write_lock(&ipfrag_lock);
+	__ipq_unlink(ipq);
+	write_unlock(&ipfrag_lock);
+}
+
+static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot)
+{
+	return jhash_3words((u32)id << 16 | prot, saddr, daddr,
+			    ipfrag_hash_rnd) & (IPQ_HASHSZ - 1);
+}
+
+static struct timer_list ipfrag_secret_timer;
+int sysctl_ipfrag_secret_interval = 10 * 60 * HZ;
+
+static void ipfrag_secret_rebuild(unsigned long dummy)
+{
+	unsigned long now = jiffies;
+	int i;
+
+	write_lock(&ipfrag_lock);
+	get_random_bytes(&ipfrag_hash_rnd, sizeof(u32));
+	for (i = 0; i < IPQ_HASHSZ; i++) {
+		struct ipq *q;
+
+		q = ipq_hash[i];
+		while (q) {
+			struct ipq *next = q->next;
+			unsigned int hval = ipqhashfn(q->id, q->saddr,
+						      q->daddr, q->protocol);
+
+			if (hval != i) {
+				/* Unlink. */
+				if (q->next)
+					q->next->pprev = q->pprev;
+				*q->pprev = q->next;
+
+				/* Relink to new hash chain. */
+				if ((q->next = ipq_hash[hval]) != NULL)
+					q->next->pprev = &q->next;
+				ipq_hash[hval] = q;
+				q->pprev = &ipq_hash[hval];
+			}
+
+			q = next;
+		}
+	}
+	write_unlock(&ipfrag_lock);
+
+	mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval);
+}
+
+atomic_t ip_frag_mem = ATOMIC_INIT(0);	/* Memory used for fragments */
+
+/* Memory Tracking Functions. */
+static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work)
+{
+	if (work)
+		*work -= skb->truesize;
+	atomic_sub(skb->truesize, &ip_frag_mem);
+	kfree_skb(skb);
+}
+
+static __inline__ void frag_free_queue(struct ipq *qp, int *work)
+{
+	if (work)
+		*work -= sizeof(struct ipq);
+	atomic_sub(sizeof(struct ipq), &ip_frag_mem);
+	kfree(qp);
+}
+
+static __inline__ struct ipq *frag_alloc_queue(void)
+{
+	struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC);
+
+	if(!qp)
+		return NULL;
+	atomic_add(sizeof(struct ipq), &ip_frag_mem);
+	return qp;
+}
+
+
+/* Destruction primitives. */
+
+/* Complete destruction of ipq. */
+static void ip_frag_destroy(struct ipq *qp, int *work)
+{
+	struct sk_buff *fp;
+
+	BUG_TRAP(qp->last_in&COMPLETE);
+	BUG_TRAP(del_timer(&qp->timer) == 0);
+
+	/* Release all fragment data. */
+	fp = qp->fragments;
+	while (fp) {
+		struct sk_buff *xp = fp->next;
+
+		frag_kfree_skb(fp, work);
+		fp = xp;
+	}
+
+	/* Finally, release the queue descriptor itself. */
+	frag_free_queue(qp, work);
+}
+
+static __inline__ void ipq_put(struct ipq *ipq, int *work)
+{
+	if (atomic_dec_and_test(&ipq->refcnt))
+		ip_frag_destroy(ipq, work);
+}
+
+/* Kill ipq entry. It is not destroyed immediately,
+ * because caller (and someone more) holds reference count.
+ */
+static void ipq_kill(struct ipq *ipq)
+{
+	if (del_timer(&ipq->timer))
+		atomic_dec(&ipq->refcnt);
+
+	if (!(ipq->last_in & COMPLETE)) {
+		ipq_unlink(ipq);
+		atomic_dec(&ipq->refcnt);
+		ipq->last_in |= COMPLETE;
+	}
+}
+
+/* Memory limiting on fragments.  Evictor trashes the oldest 
+ * fragment queue until we are back under the threshold.
+ */
+static void ip_evictor(void)
+{
+	struct ipq *qp;
+	struct list_head *tmp;
+	int work;
+
+	work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh;
+	if (work <= 0)
+		return;
+
+	while (work > 0) {
+		read_lock(&ipfrag_lock);
+		if (list_empty(&ipq_lru_list)) {
+			read_unlock(&ipfrag_lock);
+			return;
+		}
+		tmp = ipq_lru_list.next;
+		qp = list_entry(tmp, struct ipq, lru_list);
+		atomic_inc(&qp->refcnt);
+		read_unlock(&ipfrag_lock);
+
+		spin_lock(&qp->lock);
+		if (!(qp->last_in&COMPLETE))
+			ipq_kill(qp);
+		spin_unlock(&qp->lock);
+
+		ipq_put(qp, &work);
+		IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+	}
+}
+
+/*
+ * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
+ */
+static void ip_expire(unsigned long arg)
+{
+	struct ipq *qp = (struct ipq *) arg;
+
+	spin_lock(&qp->lock);
+
+	if (qp->last_in & COMPLETE)
+		goto out;
+
+	ipq_kill(qp);
+
+	IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT);
+	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+
+	if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) {
+		struct sk_buff *head = qp->fragments;
+		/* Send an ICMP "Fragment Reassembly Timeout" message. */
+		if ((head->dev = dev_get_by_index(qp->iif)) != NULL) {
+			icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+			dev_put(head->dev);
+		}
+	}
+out:
+	spin_unlock(&qp->lock);
+	ipq_put(qp, NULL);
+}
+
+/* Creation primitives. */
+
+static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in)
+{
+	struct ipq *qp;
+
+	write_lock(&ipfrag_lock);
+#ifdef CONFIG_SMP
+	/* With SMP race we have to recheck hash table, because
+	 * such entry could be created on other cpu, while we
+	 * promoted read lock to write lock.
+	 */
+	for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+		if(qp->id == qp_in->id		&&
+		   qp->saddr == qp_in->saddr	&&
+		   qp->daddr == qp_in->daddr	&&
+		   qp->protocol == qp_in->protocol &&
+		   qp->user == qp_in->user) {
+			atomic_inc(&qp->refcnt);
+			write_unlock(&ipfrag_lock);
+			qp_in->last_in |= COMPLETE;
+			ipq_put(qp_in, NULL);
+			return qp;
+		}
+	}
+#endif
+	qp = qp_in;
+
+	if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time))
+		atomic_inc(&qp->refcnt);
+
+	atomic_inc(&qp->refcnt);
+	if((qp->next = ipq_hash[hash]) != NULL)
+		qp->next->pprev = &qp->next;
+	ipq_hash[hash] = qp;
+	qp->pprev = &ipq_hash[hash];
+	INIT_LIST_HEAD(&qp->lru_list);
+	list_add_tail(&qp->lru_list, &ipq_lru_list);
+	ip_frag_nqueues++;
+	write_unlock(&ipfrag_lock);
+	return qp;
+}
+
+/* Add an entry to the 'ipq' queue for a newly received IP datagram. */
+static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
+{
+	struct ipq *qp;
+
+	if ((qp = frag_alloc_queue()) == NULL)
+		goto out_nomem;
+
+	qp->protocol = iph->protocol;
+	qp->last_in = 0;
+	qp->id = iph->id;
+	qp->saddr = iph->saddr;
+	qp->daddr = iph->daddr;
+	qp->user = user;
+	qp->len = 0;
+	qp->meat = 0;
+	qp->fragments = NULL;
+	qp->iif = 0;
+
+	/* Initialize a timer for this entry. */
+	init_timer(&qp->timer);
+	qp->timer.data = (unsigned long) qp;	/* pointer to queue	*/
+	qp->timer.function = ip_expire;		/* expire function	*/
+	spin_lock_init(&qp->lock);
+	atomic_set(&qp->refcnt, 1);
+
+	return ip_frag_intern(hash, qp);
+
+out_nomem:
+	NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n"));
+	return NULL;
+}
+
+/* Find the correct entry in the "incomplete datagrams" queue for
+ * this IP datagram, and create new one, if nothing is found.
+ */
+static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
+{
+	__u16 id = iph->id;
+	__u32 saddr = iph->saddr;
+	__u32 daddr = iph->daddr;
+	__u8 protocol = iph->protocol;
+	unsigned int hash = ipqhashfn(id, saddr, daddr, protocol);
+	struct ipq *qp;
+
+	read_lock(&ipfrag_lock);
+	for(qp = ipq_hash[hash]; qp; qp = qp->next) {
+		if(qp->id == id		&&
+		   qp->saddr == saddr	&&
+		   qp->daddr == daddr	&&
+		   qp->protocol == protocol &&
+		   qp->user == user) {
+			atomic_inc(&qp->refcnt);
+			read_unlock(&ipfrag_lock);
+			return qp;
+		}
+	}
+	read_unlock(&ipfrag_lock);
+
+	return ip_frag_create(hash, iph, user);
+}
+
+/* Add new segment to existing queue. */
+static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+{
+	struct sk_buff *prev, *next;
+	int flags, offset;
+	int ihl, end;
+
+	if (qp->last_in & COMPLETE)
+		goto err;
+
+ 	offset = ntohs(skb->nh.iph->frag_off);
+	flags = offset & ~IP_OFFSET;
+	offset &= IP_OFFSET;
+	offset <<= 3;		/* offset is in 8-byte chunks */
+ 	ihl = skb->nh.iph->ihl * 4;
+
+	/* Determine the position of this fragment. */
+ 	end = offset + skb->len - ihl;
+
+	/* Is this the final fragment? */
+	if ((flags & IP_MF) == 0) {
+		/* If we already have some bits beyond end
+		 * or have different end, the segment is corrrupted.
+		 */
+		if (end < qp->len ||
+		    ((qp->last_in & LAST_IN) && end != qp->len))
+			goto err;
+		qp->last_in |= LAST_IN;
+		qp->len = end;
+	} else {
+		if (end&7) {
+			end &= ~7;
+			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+				skb->ip_summed = CHECKSUM_NONE;
+		}
+		if (end > qp->len) {
+			/* Some bits beyond end -> corruption. */
+			if (qp->last_in & LAST_IN)
+				goto err;
+			qp->len = end;
+		}
+	}
+	if (end == offset)
+		goto err;
+
+	if (pskb_pull(skb, ihl) == NULL)
+		goto err;
+	if (pskb_trim(skb, end-offset))
+		goto err;
+
+	/* Find out which fragments are in front and at the back of us
+	 * in the chain of fragments so far.  We must know where to put
+	 * this fragment, right?
+	 */
+	prev = NULL;
+	for(next = qp->fragments; next != NULL; next = next->next) {
+		if (FRAG_CB(next)->offset >= offset)
+			break;	/* bingo! */
+		prev = next;
+	}
+
+	/* We found where to put this one.  Check for overlap with
+	 * preceding fragment, and, if needed, align things so that
+	 * any overlaps are eliminated.
+	 */
+	if (prev) {
+		int i = (FRAG_CB(prev)->offset + prev->len) - offset;
+
+		if (i > 0) {
+			offset += i;
+			if (end <= offset)
+				goto err;
+			if (!pskb_pull(skb, i))
+				goto err;
+			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+				skb->ip_summed = CHECKSUM_NONE;
+		}
+	}
+
+	while (next && FRAG_CB(next)->offset < end) {
+		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+
+		if (i < next->len) {
+			/* Eat head of the next overlapped fragment
+			 * and leave the loop. The next ones cannot overlap.
+			 */
+			if (!pskb_pull(next, i))
+				goto err;
+			FRAG_CB(next)->offset += i;
+			qp->meat -= i;
+			if (next->ip_summed != CHECKSUM_UNNECESSARY)
+				next->ip_summed = CHECKSUM_NONE;
+			break;
+		} else {
+			struct sk_buff *free_it = next;
+
+			/* Old fragmnet is completely overridden with
+			 * new one drop it.
+			 */
+			next = next->next;
+
+			if (prev)
+				prev->next = next;
+			else
+				qp->fragments = next;
+
+			qp->meat -= free_it->len;
+			frag_kfree_skb(free_it, NULL);
+		}
+	}
+
+	FRAG_CB(skb)->offset = offset;
+
+	/* Insert this fragment in the chain of fragments. */
+	skb->next = next;
+	if (prev)
+		prev->next = skb;
+	else
+		qp->fragments = skb;
+
+ 	if (skb->dev)
+ 		qp->iif = skb->dev->ifindex;
+	skb->dev = NULL;
+	qp->stamp = skb->stamp;
+	qp->meat += skb->len;
+	atomic_add(skb->truesize, &ip_frag_mem);
+	if (offset == 0)
+		qp->last_in |= FIRST_IN;
+
+	write_lock(&ipfrag_lock);
+	list_move_tail(&qp->lru_list, &ipq_lru_list);
+	write_unlock(&ipfrag_lock);
+
+	return;
+
+err:
+	kfree_skb(skb);
+}
+
+
+/* Build a new IP datagram from all its fragments. */
+
+static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev)
+{
+	struct iphdr *iph;
+	struct sk_buff *fp, *head = qp->fragments;
+	int len;
+	int ihlen;
+
+	ipq_kill(qp);
+
+	BUG_TRAP(head != NULL);
+	BUG_TRAP(FRAG_CB(head)->offset == 0);
+
+	/* Allocate a new buffer for the datagram. */
+	ihlen = head->nh.iph->ihl*4;
+	len = ihlen + qp->len;
+
+	if(len > 65535)
+		goto out_oversize;
+
+	/* Head of list must not be cloned. */
+	if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC))
+		goto out_nomem;
+
+	/* If the first fragment is fragmented itself, we split
+	 * it to two chunks: the first with data and paged part
+	 * and the second, holding only fragments. */
+	if (skb_shinfo(head)->frag_list) {
+		struct sk_buff *clone;
+		int i, plen = 0;
+
+		if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL)
+			goto out_nomem;
+		clone->next = head->next;
+		head->next = clone;
+		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
+		skb_shinfo(head)->frag_list = NULL;
+		for (i=0; i<skb_shinfo(head)->nr_frags; i++)
+			plen += skb_shinfo(head)->frags[i].size;
+		clone->len = clone->data_len = head->data_len - plen;
+		head->data_len -= clone->len;
+		head->len -= clone->len;
+		clone->csum = 0;
+		clone->ip_summed = head->ip_summed;
+		atomic_add(clone->truesize, &ip_frag_mem);
+	}
+
+	skb_shinfo(head)->frag_list = head->next;
+	skb_push(head, head->data - head->nh.raw);
+	atomic_sub(head->truesize, &ip_frag_mem);
+
+	for (fp=head->next; fp; fp = fp->next) {
+		head->data_len += fp->len;
+		head->len += fp->len;
+		if (head->ip_summed != fp->ip_summed)
+			head->ip_summed = CHECKSUM_NONE;
+		else if (head->ip_summed == CHECKSUM_HW)
+			head->csum = csum_add(head->csum, fp->csum);
+		head->truesize += fp->truesize;
+		atomic_sub(fp->truesize, &ip_frag_mem);
+	}
+
+	head->next = NULL;
+	head->dev = dev;
+	head->stamp = qp->stamp;
+
+	iph = head->nh.iph;
+	iph->frag_off = 0;
+	iph->tot_len = htons(len);
+	IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
+	qp->fragments = NULL;
+	return head;
+
+out_nomem:
+ 	NETDEBUG(if (net_ratelimit())
+	         printk(KERN_ERR 
+			"IP: queue_glue: no memory for gluing queue %p\n",
+			qp));
+	goto out_fail;
+out_oversize:
+	if (net_ratelimit())
+		printk(KERN_INFO
+			"Oversized IP packet from %d.%d.%d.%d.\n",
+			NIPQUAD(qp->saddr));
+out_fail:
+	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+	return NULL;
+}
+
+/* Process an incoming IP datagram fragment. */
+struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct ipq *qp;
+	struct net_device *dev;
+	
+	IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS);
+
+	/* Start by cleaning up the memory. */
+	if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh)
+		ip_evictor();
+
+	dev = skb->dev;
+
+	/* Lookup (or create) queue header */
+	if ((qp = ip_find(iph, user)) != NULL) {
+		struct sk_buff *ret = NULL;
+
+		spin_lock(&qp->lock);
+
+		ip_frag_queue(qp, skb);
+
+		if (qp->last_in == (FIRST_IN|LAST_IN) &&
+		    qp->meat == qp->len)
+			ret = ip_frag_reasm(qp, dev);
+
+		spin_unlock(&qp->lock);
+		ipq_put(qp, NULL);
+		return ret;
+	}
+
+	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
+	kfree_skb(skb);
+	return NULL;
+}
+
+void ipfrag_init(void)
+{
+	ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^
+				 (jiffies ^ (jiffies >> 6)));
+
+	init_timer(&ipfrag_secret_timer);
+	ipfrag_secret_timer.function = ipfrag_secret_rebuild;
+	ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval;
+	add_timer(&ipfrag_secret_timer);
+}
+
+EXPORT_SYMBOL(ip_defrag);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
new file mode 100644
index 000000000000..884835522224
--- /dev/null
+++ b/net/ipv4/ip_gre.c
@@ -0,0 +1,1290 @@
+/*
+ *	Linux NET3:	GRE over IP protocol decoder. 
+ *
+ *	Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ipip.h>
+#include <net/arp.h>
+#include <net/checksum.h>
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+
+#ifdef CONFIG_IPV6
+#include <net/ipv6.h>
+#include <net/ip6_fib.h>
+#include <net/ip6_route.h>
+#endif
+
+/*
+   Problems & solutions
+   --------------------
+
+   1. The most important issue is detecting local dead loops.
+   They would cause complete host lockup in transmit, which
+   would be "resolved" by stack overflow or, if queueing is enabled,
+   with infinite looping in net_bh.
+
+   We cannot track such dead loops during route installation,
+   it is infeasible task. The most general solutions would be
+   to keep skb->encapsulation counter (sort of local ttl),
+   and silently drop packet when it expires. It is the best
+   solution, but it supposes maintaing new variable in ALL
+   skb, even if no tunneling is used.
+
+   Current solution: t->recursion lock breaks dead loops. It looks 
+   like dev->tbusy flag, but I preferred new variable, because
+   the semantics is different. One day, when hard_start_xmit
+   will be multithreaded we will have to use skb->encapsulation.
+
+
+
+   2. Networking dead loops would not kill routers, but would really
+   kill network. IP hop limit plays role of "t->recursion" in this case,
+   if we copy it from packet being encapsulated to upper header.
+   It is very good solution, but it introduces two problems:
+
+   - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
+     do not work over tunnels.
+   - traceroute does not work. I planned to relay ICMP from tunnel,
+     so that this problem would be solved and traceroute output
+     would even more informative. This idea appeared to be wrong:
+     only Linux complies to rfc1812 now (yes, guys, Linux is the only
+     true router now :-)), all routers (at least, in neighbourhood of mine)
+     return only 8 bytes of payload. It is the end.
+
+   Hence, if we want that OSPF worked or traceroute said something reasonable,
+   we should search for another solution.
+
+   One of them is to parse packet trying to detect inner encapsulation
+   made by our node. It is difficult or even impossible, especially,
+   taking into account fragmentation. TO be short, tt is not solution at all.
+
+   Current solution: The solution was UNEXPECTEDLY SIMPLE.
+   We force DF flag on tunnels with preconfigured hop limit,
+   that is ALL. :-) Well, it does not remove the problem completely,
+   but exponential growth of network traffic is changed to linear
+   (branches, that exceed pmtu are pruned) and tunnel mtu
+   fastly degrades to value <68, where looping stops.
+   Yes, it is not good if there exists a router in the loop,
+   which does not force DF, even when encapsulating packets have DF set.
+   But it is not our problem! Nobody could accuse us, we made
+   all that we could make. Even if it is your gated who injected
+   fatal route to network, even if it were you who configured
+   fatal static route: you are innocent. :-)
+
+
+
+   3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
+   practically identical code. It would be good to glue them
+   together, but it is not very evident, how to make them modular.
+   sit is integral part of IPv6, ipip and gre are naturally modular.
+   We could extract common parts (hash table, ioctl etc)
+   to a separate module (ip_tunnel.c).
+
+   Alexey Kuznetsov.
+ */
+
+static int ipgre_tunnel_init(struct net_device *dev);
+static void ipgre_tunnel_setup(struct net_device *dev);
+
+/* Fallback tunnel: no source, no destination, no key, no options */
+
+static int ipgre_fb_tunnel_init(struct net_device *dev);
+
+static struct net_device *ipgre_fb_tunnel_dev;
+
+/* Tunnel hash table */
+
+/*
+   4 hash tables:
+
+   3: (remote,local)
+   2: (remote,*)
+   1: (*,local)
+   0: (*,*)
+
+   We require exact key match i.e. if a key is present in packet
+   it will match only tunnel with the same key; if it is not present,
+   it will match only keyless tunnel.
+
+   All keysless packets, if not matched configured keyless tunnels
+   will match fallback tunnel.
+ */
+
+#define HASH_SIZE  16
+#define HASH(addr) ((addr^(addr>>4))&0xF)
+
+static struct ip_tunnel *tunnels[4][HASH_SIZE];
+
+#define tunnels_r_l	(tunnels[3])
+#define tunnels_r	(tunnels[2])
+#define tunnels_l	(tunnels[1])
+#define tunnels_wc	(tunnels[0])
+
+static DEFINE_RWLOCK(ipgre_lock);
+
+/* Given src, dst and key, find appropriate for input tunnel. */
+
+static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key)
+{
+	unsigned h0 = HASH(remote);
+	unsigned h1 = HASH(key);
+	struct ip_tunnel *t;
+
+	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
+		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
+			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+				return t;
+		}
+	}
+	for (t = tunnels_r[h0^h1]; t; t = t->next) {
+		if (remote == t->parms.iph.daddr) {
+			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+				return t;
+		}
+	}
+	for (t = tunnels_l[h1]; t; t = t->next) {
+		if (local == t->parms.iph.saddr ||
+		     (local == t->parms.iph.daddr && MULTICAST(local))) {
+			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+				return t;
+		}
+	}
+	for (t = tunnels_wc[h1]; t; t = t->next) {
+		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
+			return t;
+	}
+
+	if (ipgre_fb_tunnel_dev->flags&IFF_UP)
+		return ipgre_fb_tunnel_dev->priv;
+	return NULL;
+}
+
+static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t)
+{
+	u32 remote = t->parms.iph.daddr;
+	u32 local = t->parms.iph.saddr;
+	u32 key = t->parms.i_key;
+	unsigned h = HASH(key);
+	int prio = 0;
+
+	if (local)
+		prio |= 1;
+	if (remote && !MULTICAST(remote)) {
+		prio |= 2;
+		h ^= HASH(remote);
+	}
+
+	return &tunnels[prio][h];
+}
+
+static void ipgre_tunnel_link(struct ip_tunnel *t)
+{
+	struct ip_tunnel **tp = ipgre_bucket(t);
+
+	t->next = *tp;
+	write_lock_bh(&ipgre_lock);
+	*tp = t;
+	write_unlock_bh(&ipgre_lock);
+}
+
+static void ipgre_tunnel_unlink(struct ip_tunnel *t)
+{
+	struct ip_tunnel **tp;
+
+	for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) {
+		if (t == *tp) {
+			write_lock_bh(&ipgre_lock);
+			*tp = t->next;
+			write_unlock_bh(&ipgre_lock);
+			break;
+		}
+	}
+}
+
+static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+{
+	u32 remote = parms->iph.daddr;
+	u32 local = parms->iph.saddr;
+	u32 key = parms->i_key;
+	struct ip_tunnel *t, **tp, *nt;
+	struct net_device *dev;
+	unsigned h = HASH(key);
+	int prio = 0;
+	char name[IFNAMSIZ];
+
+	if (local)
+		prio |= 1;
+	if (remote && !MULTICAST(remote)) {
+		prio |= 2;
+		h ^= HASH(remote);
+	}
+	for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
+		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
+			if (key == t->parms.i_key)
+				return t;
+		}
+	}
+	if (!create)
+		return NULL;
+
+	if (parms->name[0])
+		strlcpy(name, parms->name, IFNAMSIZ);
+	else {
+		int i;
+		for (i=1; i<100; i++) {
+			sprintf(name, "gre%d", i);
+			if (__dev_get_by_name(name) == NULL)
+				break;
+		}
+		if (i==100)
+			goto failed;
+	}
+
+	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
+	if (!dev)
+	  return NULL;
+
+	dev->init = ipgre_tunnel_init;
+	nt = dev->priv;
+	nt->parms = *parms;
+
+	if (register_netdevice(dev) < 0) {
+		free_netdev(dev);
+		goto failed;
+	}
+
+	nt = dev->priv;
+	nt->parms = *parms;
+
+	dev_hold(dev);
+	ipgre_tunnel_link(nt);
+	/* Do not decrement MOD_USE_COUNT here. */
+	return nt;
+
+failed:
+	return NULL;
+}
+
+static void ipgre_tunnel_uninit(struct net_device *dev)
+{
+	ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv);
+	dev_put(dev);
+}
+
+
+static void ipgre_err(struct sk_buff *skb, u32 info)
+{
+#ifndef I_WISH_WORLD_WERE_PERFECT
+
+/* It is not :-( All the routers (except for Linux) return only
+   8 bytes of packet payload. It means, that precise relaying of
+   ICMP in the real Internet is absolutely infeasible.
+
+   Moreover, Cisco "wise men" put GRE key to the third word
+   in GRE header. It makes impossible maintaining even soft state for keyed
+   GRE tunnels with enabled checksum. Tell them "thank you".
+
+   Well, I wonder, rfc1812 was written by Cisco employee,
+   what the hell these idiots break standrads established
+   by themself???
+ */
+
+	struct iphdr *iph = (struct iphdr*)skb->data;
+	u16	     *p = (u16*)(skb->data+(iph->ihl<<2));
+	int grehlen = (iph->ihl<<2) + 4;
+	int type = skb->h.icmph->type;
+	int code = skb->h.icmph->code;
+	struct ip_tunnel *t;
+	u16 flags;
+
+	flags = p[0];
+	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+		if (flags&(GRE_VERSION|GRE_ROUTING))
+			return;
+		if (flags&GRE_KEY) {
+			grehlen += 4;
+			if (flags&GRE_CSUM)
+				grehlen += 4;
+		}
+	}
+
+	/* If only 8 bytes returned, keyed message will be dropped here */
+	if (skb_headlen(skb) < grehlen)
+		return;
+
+	switch (type) {
+	default:
+	case ICMP_PARAMETERPROB:
+		return;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return;
+		case ICMP_FRAG_NEEDED:
+			/* Soft state for pmtu is maintained by IP core. */
+			return;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe they are just ether pollution. --ANK
+			 */
+			break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return;
+		break;
+	}
+
+	read_lock(&ipgre_lock);
+	t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0);
+	if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr))
+		goto out;
+
+	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+		goto out;
+
+	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+		t->err_count++;
+	else
+		t->err_count = 1;
+	t->err_time = jiffies;
+out:
+	read_unlock(&ipgre_lock);
+	return;
+#else
+	struct iphdr *iph = (struct iphdr*)dp;
+	struct iphdr *eiph;
+	u16	     *p = (u16*)(dp+(iph->ihl<<2));
+	int type = skb->h.icmph->type;
+	int code = skb->h.icmph->code;
+	int rel_type = 0;
+	int rel_code = 0;
+	int rel_info = 0;
+	u16 flags;
+	int grehlen = (iph->ihl<<2) + 4;
+	struct sk_buff *skb2;
+	struct flowi fl;
+	struct rtable *rt;
+
+	if (p[1] != htons(ETH_P_IP))
+		return;
+
+	flags = p[0];
+	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
+		if (flags&(GRE_VERSION|GRE_ROUTING))
+			return;
+		if (flags&GRE_CSUM)
+			grehlen += 4;
+		if (flags&GRE_KEY)
+			grehlen += 4;
+		if (flags&GRE_SEQ)
+			grehlen += 4;
+	}
+	if (len < grehlen + sizeof(struct iphdr))
+		return;
+	eiph = (struct iphdr*)(dp + grehlen);
+
+	switch (type) {
+	default:
+		return;
+	case ICMP_PARAMETERPROB:
+		if (skb->h.icmph->un.gateway < (iph->ihl<<2))
+			return;
+
+		/* So... This guy found something strange INSIDE encapsulated
+		   packet. Well, he is fool, but what can we do ?
+		 */
+		rel_type = ICMP_PARAMETERPROB;
+		rel_info = skb->h.icmph->un.gateway - grehlen;
+		break;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return;
+		case ICMP_FRAG_NEEDED:
+			/* And it is the only really necessary thing :-) */
+			rel_info = ntohs(skb->h.icmph->un.frag.mtu);
+			if (rel_info < grehlen+68)
+				return;
+			rel_info -= grehlen;
+			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
+			if (rel_info > ntohs(eiph->tot_len))
+				return;
+			break;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe, it is just ether pollution. --ANK
+			 */
+			rel_type = ICMP_DEST_UNREACH;
+			rel_code = ICMP_HOST_UNREACH;
+			break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return;
+		break;
+	}
+
+	/* Prepare fake skb to feed it to icmp_send */
+	skb2 = skb_clone(skb, GFP_ATOMIC);
+	if (skb2 == NULL)
+		return;
+	dst_release(skb2->dst);
+	skb2->dst = NULL;
+	skb_pull(skb2, skb->data - (u8*)eiph);
+	skb2->nh.raw = skb2->data;
+
+	/* Try to guess incoming interface */
+	memset(&fl, 0, sizeof(fl));
+	fl.fl4_dst = eiph->saddr;
+	fl.fl4_tos = RT_TOS(eiph->tos);
+	fl.proto = IPPROTO_GRE;
+	if (ip_route_output_key(&rt, &fl)) {
+		kfree_skb(skb2);
+		return;
+	}
+	skb2->dev = rt->u.dst.dev;
+
+	/* route "incoming" packet */
+	if (rt->rt_flags&RTCF_LOCAL) {
+		ip_rt_put(rt);
+		rt = NULL;
+		fl.fl4_dst = eiph->daddr;
+		fl.fl4_src = eiph->saddr;
+		fl.fl4_tos = eiph->tos;
+		if (ip_route_output_key(&rt, &fl) ||
+		    rt->u.dst.dev->type != ARPHRD_IPGRE) {
+			ip_rt_put(rt);
+			kfree_skb(skb2);
+			return;
+		}
+	} else {
+		ip_rt_put(rt);
+		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
+		    skb2->dst->dev->type != ARPHRD_IPGRE) {
+			kfree_skb(skb2);
+			return;
+		}
+	}
+
+	/* change mtu on this route */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+		if (rel_info > dst_mtu(skb2->dst)) {
+			kfree_skb(skb2);
+			return;
+		}
+		skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
+		rel_info = htonl(rel_info);
+	} else if (type == ICMP_TIME_EXCEEDED) {
+		struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+		if (t->parms.iph.ttl) {
+			rel_type = ICMP_DEST_UNREACH;
+			rel_code = ICMP_HOST_UNREACH;
+		}
+	}
+
+	icmp_send(skb2, rel_type, rel_code, rel_info);
+	kfree_skb(skb2);
+#endif
+}
+
+static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
+{
+	if (INET_ECN_is_ce(iph->tos)) {
+		if (skb->protocol == htons(ETH_P_IP)) {
+			IP_ECN_set_ce(skb->nh.iph);
+		} else if (skb->protocol == htons(ETH_P_IPV6)) {
+			IP6_ECN_set_ce(skb->nh.ipv6h);
+		}
+	}
+}
+
+static inline u8
+ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
+{
+	u8 inner = 0;
+	if (skb->protocol == htons(ETH_P_IP))
+		inner = old_iph->tos;
+	else if (skb->protocol == htons(ETH_P_IPV6))
+		inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
+	return INET_ECN_encapsulate(tos, inner);
+}
+
+static int ipgre_rcv(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	u8     *h;
+	u16    flags;
+	u16    csum = 0;
+	u32    key = 0;
+	u32    seqno = 0;
+	struct ip_tunnel *tunnel;
+	int    offset = 4;
+
+	if (!pskb_may_pull(skb, 16))
+		goto drop_nolock;
+
+	iph = skb->nh.iph;
+	h = skb->data;
+	flags = *(u16*)h;
+
+	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
+		/* - Version must be 0.
+		   - We do not support routing headers.
+		 */
+		if (flags&(GRE_VERSION|GRE_ROUTING))
+			goto drop_nolock;
+
+		if (flags&GRE_CSUM) {
+			if (skb->ip_summed == CHECKSUM_HW) {
+				csum = (u16)csum_fold(skb->csum);
+				if (csum)
+					skb->ip_summed = CHECKSUM_NONE;
+			}
+			if (skb->ip_summed == CHECKSUM_NONE) {
+				skb->csum = skb_checksum(skb, 0, skb->len, 0);
+				skb->ip_summed = CHECKSUM_HW;
+				csum = (u16)csum_fold(skb->csum);
+			}
+			offset += 4;
+		}
+		if (flags&GRE_KEY) {
+			key = *(u32*)(h + offset);
+			offset += 4;
+		}
+		if (flags&GRE_SEQ) {
+			seqno = ntohl(*(u32*)(h + offset));
+			offset += 4;
+		}
+	}
+
+	read_lock(&ipgre_lock);
+	if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) {
+		secpath_reset(skb);
+
+		skb->protocol = *(u16*)(h + 2);
+		/* WCCP version 1 and 2 protocol decoding.
+		 * - Change protocol to IP
+		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
+		 */
+		if (flags == 0 &&
+		    skb->protocol == __constant_htons(ETH_P_WCCP)) {
+			skb->protocol = __constant_htons(ETH_P_IP);
+			if ((*(h + offset) & 0xF0) != 0x40) 
+				offset += 4;
+		}
+
+		skb->mac.raw = skb->nh.raw;
+		skb->nh.raw = __pskb_pull(skb, offset);
+		skb_postpull_rcsum(skb, skb->mac.raw, offset);
+		memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+		skb->pkt_type = PACKET_HOST;
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+		if (MULTICAST(iph->daddr)) {
+			/* Looped back packet, drop it! */
+			if (((struct rtable*)skb->dst)->fl.iif == 0)
+				goto drop;
+			tunnel->stat.multicast++;
+			skb->pkt_type = PACKET_BROADCAST;
+		}
+#endif
+
+		if (((flags&GRE_CSUM) && csum) ||
+		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
+			tunnel->stat.rx_crc_errors++;
+			tunnel->stat.rx_errors++;
+			goto drop;
+		}
+		if (tunnel->parms.i_flags&GRE_SEQ) {
+			if (!(flags&GRE_SEQ) ||
+			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
+				tunnel->stat.rx_fifo_errors++;
+				tunnel->stat.rx_errors++;
+				goto drop;
+			}
+			tunnel->i_seqno = seqno + 1;
+		}
+		tunnel->stat.rx_packets++;
+		tunnel->stat.rx_bytes += skb->len;
+		skb->dev = tunnel->dev;
+		dst_release(skb->dst);
+		skb->dst = NULL;
+		nf_reset(skb);
+		ipgre_ecn_decapsulate(iph, skb);
+		netif_rx(skb);
+		read_unlock(&ipgre_lock);
+		return(0);
+	}
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0);
+
+drop:
+	read_unlock(&ipgre_lock);
+drop_nolock:
+	kfree_skb(skb);
+	return(0);
+}
+
+static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+	struct net_device_stats *stats = &tunnel->stat;
+	struct iphdr  *old_iph = skb->nh.iph;
+	struct iphdr  *tiph;
+	u8     tos;
+	u16    df;
+	struct rtable *rt;     			/* Route to the other host */
+	struct net_device *tdev;			/* Device to other host */
+	struct iphdr  *iph;			/* Our new IP header */
+	int    max_headroom;			/* The extra header space needed */
+	int    gre_hlen;
+	u32    dst;
+	int    mtu;
+
+	if (tunnel->recursion++) {
+		tunnel->stat.collisions++;
+		goto tx_error;
+	}
+
+	if (dev->hard_header) {
+		gre_hlen = 0;
+		tiph = (struct iphdr*)skb->data;
+	} else {
+		gre_hlen = tunnel->hlen;
+		tiph = &tunnel->parms.iph;
+	}
+
+	if ((dst = tiph->daddr) == 0) {
+		/* NBMA tunnel */
+
+		if (skb->dst == NULL) {
+			tunnel->stat.tx_fifo_errors++;
+			goto tx_error;
+		}
+
+		if (skb->protocol == htons(ETH_P_IP)) {
+			rt = (struct rtable*)skb->dst;
+			if ((dst = rt->rt_gateway) == 0)
+				goto tx_error_icmp;
+		}
+#ifdef CONFIG_IPV6
+		else if (skb->protocol == htons(ETH_P_IPV6)) {
+			struct in6_addr *addr6;
+			int addr_type;
+			struct neighbour *neigh = skb->dst->neighbour;
+
+			if (neigh == NULL)
+				goto tx_error;
+
+			addr6 = (struct in6_addr*)&neigh->primary_key;
+			addr_type = ipv6_addr_type(addr6);
+
+			if (addr_type == IPV6_ADDR_ANY) {
+				addr6 = &skb->nh.ipv6h->daddr;
+				addr_type = ipv6_addr_type(addr6);
+			}
+
+			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
+				goto tx_error_icmp;
+
+			dst = addr6->s6_addr32[3];
+		}
+#endif
+		else
+			goto tx_error;
+	}
+
+	tos = tiph->tos;
+	if (tos&1) {
+		if (skb->protocol == htons(ETH_P_IP))
+			tos = old_iph->tos;
+		tos &= ~1;
+	}
+
+	{
+		struct flowi fl = { .oif = tunnel->parms.link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = dst,
+						.saddr = tiph->saddr,
+						.tos = RT_TOS(tos) } },
+				    .proto = IPPROTO_GRE };
+		if (ip_route_output_key(&rt, &fl)) {
+			tunnel->stat.tx_carrier_errors++;
+			goto tx_error;
+		}
+	}
+	tdev = rt->u.dst.dev;
+
+	if (tdev == dev) {
+		ip_rt_put(rt);
+		tunnel->stat.collisions++;
+		goto tx_error;
+	}
+
+	df = tiph->frag_off;
+	if (df)
+		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
+	else
+		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+
+	if (skb->dst)
+		skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+	if (skb->protocol == htons(ETH_P_IP)) {
+		df |= (old_iph->frag_off&htons(IP_DF));
+
+		if ((old_iph->frag_off&htons(IP_DF)) &&
+		    mtu < ntohs(old_iph->tot_len)) {
+			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#ifdef CONFIG_IPV6
+	else if (skb->protocol == htons(ETH_P_IPV6)) {
+		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;
+
+		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
+			if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) ||
+			    rt6->rt6i_dst.plen == 128) {
+				rt6->rt6i_flags |= RTF_MODIFIED;
+				skb->dst->metrics[RTAX_MTU-1] = mtu;
+			}
+		}
+
+		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
+			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+	}
+#endif
+
+	if (tunnel->err_count > 0) {
+		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+			tunnel->err_count--;
+
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;
+	}
+
+	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+
+	if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+  			stats->tx_dropped++;
+			dev_kfree_skb(skb);
+			tunnel->recursion--;
+			return 0;
+		}
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
+		dev_kfree_skb(skb);
+		skb = new_skb;
+		old_iph = skb->nh.iph;
+	}
+
+	skb->h.raw = skb->nh.raw;
+	skb->nh.raw = skb_push(skb, gre_hlen);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+
+	iph 			=	skb->nh.iph;
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr) >> 2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_GRE;
+	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
+	iph->daddr		=	rt->rt_dst;
+	iph->saddr		=	rt->rt_src;
+
+	if ((iph->ttl = tiph->ttl) == 0) {
+		if (skb->protocol == htons(ETH_P_IP))
+			iph->ttl = old_iph->ttl;
+#ifdef CONFIG_IPV6
+		else if (skb->protocol == htons(ETH_P_IPV6))
+			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
+#endif
+		else
+			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
+	}
+
+	((u16*)(iph+1))[0] = tunnel->parms.o_flags;
+	((u16*)(iph+1))[1] = skb->protocol;
+
+	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
+		u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4);
+
+		if (tunnel->parms.o_flags&GRE_SEQ) {
+			++tunnel->o_seqno;
+			*ptr = htonl(tunnel->o_seqno);
+			ptr--;
+		}
+		if (tunnel->parms.o_flags&GRE_KEY) {
+			*ptr = tunnel->parms.o_key;
+			ptr--;
+		}
+		if (tunnel->parms.o_flags&GRE_CSUM) {
+			*ptr = 0;
+			*(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
+		}
+	}
+
+	nf_reset(skb);
+
+	IPTUNNEL_XMIT();
+	tunnel->recursion--;
+	return 0;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+
+tx_error:
+	stats->tx_errors++;
+	dev_kfree_skb(skb);
+	tunnel->recursion--;
+	return 0;
+}
+
+static int
+ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	int err = 0;
+	struct ip_tunnel_parm p;
+	struct ip_tunnel *t;
+
+	switch (cmd) {
+	case SIOCGETTUNNEL:
+		t = NULL;
+		if (dev == ipgre_fb_tunnel_dev) {
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+				err = -EFAULT;
+				break;
+			}
+			t = ipgre_tunnel_locate(&p, 0);
+		}
+		if (t == NULL)
+			t = (struct ip_tunnel*)dev->priv;
+		memcpy(&p, &t->parms, sizeof(p));
+		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+			err = -EFAULT;
+		break;
+
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		err = -EFAULT;
+		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			goto done;
+
+		err = -EINVAL;
+		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
+		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
+		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
+			goto done;
+		if (p.iph.ttl)
+			p.iph.frag_off |= htons(IP_DF);
+
+		if (!(p.i_flags&GRE_KEY))
+			p.i_key = 0;
+		if (!(p.o_flags&GRE_KEY))
+			p.o_key = 0;
+
+		t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+
+		if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+			if (t != NULL) {
+				if (t->dev != dev) {
+					err = -EEXIST;
+					break;
+				}
+			} else {
+				unsigned nflags=0;
+
+				t = (struct ip_tunnel*)dev->priv;
+
+				if (MULTICAST(p.iph.daddr))
+					nflags = IFF_BROADCAST;
+				else if (p.iph.daddr)
+					nflags = IFF_POINTOPOINT;
+
+				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
+					err = -EINVAL;
+					break;
+				}
+				ipgre_tunnel_unlink(t);
+				t->parms.iph.saddr = p.iph.saddr;
+				t->parms.iph.daddr = p.iph.daddr;
+				t->parms.i_key = p.i_key;
+				t->parms.o_key = p.o_key;
+				memcpy(dev->dev_addr, &p.iph.saddr, 4);
+				memcpy(dev->broadcast, &p.iph.daddr, 4);
+				ipgre_tunnel_link(t);
+				netdev_state_change(dev);
+			}
+		}
+
+		if (t) {
+			err = 0;
+			if (cmd == SIOCCHGTUNNEL) {
+				t->parms.iph.ttl = p.iph.ttl;
+				t->parms.iph.tos = p.iph.tos;
+				t->parms.iph.frag_off = p.iph.frag_off;
+			}
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+				err = -EFAULT;
+		} else
+			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		break;
+
+	case SIOCDELTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		if (dev == ipgre_fb_tunnel_dev) {
+			err = -EFAULT;
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+				goto done;
+			err = -ENOENT;
+			if ((t = ipgre_tunnel_locate(&p, 0)) == NULL)
+				goto done;
+			err = -EPERM;
+			if (t == ipgre_fb_tunnel_dev->priv)
+				goto done;
+			dev = t->dev;
+		}
+		err = unregister_netdevice(dev);
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+done:
+	return err;
+}
+
+static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
+{
+	return &(((struct ip_tunnel*)dev->priv)->stat);
+}
+
+static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+	if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
+		return -EINVAL;
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+/* Nice toy. Unfortunately, useless in real life :-)
+   It allows to construct virtual multiprotocol broadcast "LAN"
+   over the Internet, provided multicast routing is tuned.
+
+
+   I have no idea was this bicycle invented before me,
+   so that I had to set ARPHRD_IPGRE to a random value.
+   I have an impression, that Cisco could make something similar,
+   but this feature is apparently missing in IOS<=11.2(8).
+   
+   I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
+   with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
+
+   ping -t 255 224.66.66.66
+
+   If nobody answers, mbone does not work.
+
+   ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
+   ip addr add 10.66.66.<somewhat>/24 dev Universe
+   ifconfig Universe up
+   ifconfig Universe add fe80::<Your_real_addr>/10
+   ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
+   ftp 10.66.66.66
+   ...
+   ftp fec0:6666:6666::193.233.7.65
+   ...
+
+ */
+
+static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type,
+			void *daddr, void *saddr, unsigned len)
+{
+	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
+	u16 *p = (u16*)(iph+1);
+
+	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
+	p[0]		= t->parms.o_flags;
+	p[1]		= htons(type);
+
+	/*
+	 *	Set the source hardware address. 
+	 */
+	 
+	if (saddr)
+		memcpy(&iph->saddr, saddr, 4);
+
+	if (daddr) {
+		memcpy(&iph->daddr, daddr, 4);
+		return t->hlen;
+	}
+	if (iph->daddr && !MULTICAST(iph->daddr))
+		return t->hlen;
+	
+	return -t->hlen;
+}
+
+static int ipgre_open(struct net_device *dev)
+{
+	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+
+	if (MULTICAST(t->parms.iph.daddr)) {
+		struct flowi fl = { .oif = t->parms.link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = t->parms.iph.daddr,
+						.saddr = t->parms.iph.saddr,
+						.tos = RT_TOS(t->parms.iph.tos) } },
+				    .proto = IPPROTO_GRE };
+		struct rtable *rt;
+		if (ip_route_output_key(&rt, &fl))
+			return -EADDRNOTAVAIL;
+		dev = rt->u.dst.dev;
+		ip_rt_put(rt);
+		if (__in_dev_get(dev) == NULL)
+			return -EADDRNOTAVAIL;
+		t->mlink = dev->ifindex;
+		ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
+	}
+	return 0;
+}
+
+static int ipgre_close(struct net_device *dev)
+{
+	struct ip_tunnel *t = (struct ip_tunnel*)dev->priv;
+	if (MULTICAST(t->parms.iph.daddr) && t->mlink) {
+		struct in_device *in_dev = inetdev_by_index(t->mlink);
+		if (in_dev) {
+			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
+			in_dev_put(in_dev);
+		}
+	}
+	return 0;
+}
+
+#endif
+
+static void ipgre_tunnel_setup(struct net_device *dev)
+{
+	SET_MODULE_OWNER(dev);
+	dev->uninit		= ipgre_tunnel_uninit;
+	dev->destructor 	= free_netdev;
+	dev->hard_start_xmit	= ipgre_tunnel_xmit;
+	dev->get_stats		= ipgre_tunnel_get_stats;
+	dev->do_ioctl		= ipgre_tunnel_ioctl;
+	dev->change_mtu		= ipgre_tunnel_change_mtu;
+
+	dev->type		= ARPHRD_IPGRE;
+	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
+	dev->mtu		= 1500 - sizeof(struct iphdr) - 4;
+	dev->flags		= IFF_NOARP;
+	dev->iflink		= 0;
+	dev->addr_len		= 4;
+}
+
+static int ipgre_tunnel_init(struct net_device *dev)
+{
+	struct net_device *tdev = NULL;
+	struct ip_tunnel *tunnel;
+	struct iphdr *iph;
+	int hlen = LL_MAX_HEADER;
+	int mtu = 1500;
+	int addend = sizeof(struct iphdr) + 4;
+
+	tunnel = (struct ip_tunnel*)dev->priv;
+	iph = &tunnel->parms.iph;
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+	/* Guess output device to choose reasonable mtu and hard_header_len */
+
+	if (iph->daddr) {
+		struct flowi fl = { .oif = tunnel->parms.link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = iph->daddr,
+						.saddr = iph->saddr,
+						.tos = RT_TOS(iph->tos) } },
+				    .proto = IPPROTO_GRE };
+		struct rtable *rt;
+		if (!ip_route_output_key(&rt, &fl)) {
+			tdev = rt->u.dst.dev;
+			ip_rt_put(rt);
+		}
+
+		dev->flags |= IFF_POINTOPOINT;
+
+#ifdef CONFIG_NET_IPGRE_BROADCAST
+		if (MULTICAST(iph->daddr)) {
+			if (!iph->saddr)
+				return -EINVAL;
+			dev->flags = IFF_BROADCAST;
+			dev->hard_header = ipgre_header;
+			dev->open = ipgre_open;
+			dev->stop = ipgre_close;
+		}
+#endif
+	}
+
+	if (!tdev && tunnel->parms.link)
+		tdev = __dev_get_by_index(tunnel->parms.link);
+
+	if (tdev) {
+		hlen = tdev->hard_header_len;
+		mtu = tdev->mtu;
+	}
+	dev->iflink = tunnel->parms.link;
+
+	/* Precalculate GRE options length */
+	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
+		if (tunnel->parms.o_flags&GRE_CSUM)
+			addend += 4;
+		if (tunnel->parms.o_flags&GRE_KEY)
+			addend += 4;
+		if (tunnel->parms.o_flags&GRE_SEQ)
+			addend += 4;
+	}
+	dev->hard_header_len = hlen + addend;
+	dev->mtu = mtu - addend;
+	tunnel->hlen = addend;
+	return 0;
+}
+
+int __init ipgre_fb_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+	struct iphdr *iph = &tunnel->parms.iph;
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	iph->version		= 4;
+	iph->protocol		= IPPROTO_GRE;
+	iph->ihl		= 5;
+	tunnel->hlen		= sizeof(struct iphdr) + 4;
+
+	dev_hold(dev);
+	tunnels_wc[0]		= tunnel;
+	return 0;
+}
+
+
+static struct net_protocol ipgre_protocol = {
+	.handler	=	ipgre_rcv,
+	.err_handler	=	ipgre_err,
+};
+
+
+/*
+ *	And now the modules code and kernel interface.
+ */
+
+static int __init ipgre_init(void)
+{
+	int err;
+
+	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
+
+	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
+		printk(KERN_INFO "ipgre init: can't add protocol\n");
+		return -EAGAIN;
+	}
+
+	ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
+					   ipgre_tunnel_setup);
+	if (!ipgre_fb_tunnel_dev) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init;
+
+	if ((err = register_netdev(ipgre_fb_tunnel_dev)))
+		goto err2;
+out:
+	return err;
+err2:
+	free_netdev(ipgre_fb_tunnel_dev);
+err1:
+	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+	goto out;
+}
+
+static void ipgre_fini(void)
+{
+	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
+		printk(KERN_INFO "ipgre close: can't remove protocol\n");
+
+	unregister_netdev(ipgre_fb_tunnel_dev);
+}
+
+module_init(ipgre_init);
+module_exit(ipgre_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
new file mode 100644
index 000000000000..a0d0833034be
--- /dev/null
+++ b/net/ipv4/ip_input.c
@@ -0,0 +1,431 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The Internet Protocol (IP) module.
+ *
+ * Version:	$Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Donald Becker, <becker@super.org>
+ *		Alan Cox, <Alan.Cox@linux.org>
+ *		Richard Underwood
+ *		Stefan Becker, <stefanb@yello.ping.de>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		
+ *
+ * Fixes:
+ *		Alan Cox	:	Commented a couple of minor bits of surplus code
+ *		Alan Cox	:	Undefining IP_FORWARD doesn't include the code
+ *					(just stops a compiler warning).
+ *		Alan Cox	:	Frames with >=MAX_ROUTE record routes, strict routes or loose routes
+ *					are junked rather than corrupting things.
+ *		Alan Cox	:	Frames to bad broadcast subnets are dumped
+ *					We used to process them non broadcast and
+ *					boy could that cause havoc.
+ *		Alan Cox	:	ip_forward sets the free flag on the
+ *					new frame it queues. Still crap because
+ *					it copies the frame but at least it
+ *					doesn't eat memory too.
+ *		Alan Cox	:	Generic queue code and memory fixes.
+ *		Fred Van Kempen :	IP fragment support (borrowed from NET2E)
+ *		Gerhard Koerting:	Forward fragmented frames correctly.
+ *		Gerhard Koerting: 	Fixes to my fix of the above 8-).
+ *		Gerhard Koerting:	IP interface addressing fix.
+ *		Linus Torvalds	:	More robustness checks
+ *		Alan Cox	:	Even more checks: Still not as robust as it ought to be
+ *		Alan Cox	:	Save IP header pointer for later
+ *		Alan Cox	:	ip option setting
+ *		Alan Cox	:	Use ip_tos/ip_ttl settings
+ *		Alan Cox	:	Fragmentation bogosity removed
+ *					(Thanks to Mark.Bush@prg.ox.ac.uk)
+ *		Dmitry Gorodchanin :	Send of a raw packet crash fix.
+ *		Alan Cox	:	Silly ip bug when an overlength
+ *					fragment turns up. Now frees the
+ *					queue.
+ *		Linus Torvalds/ :	Memory leakage on fragmentation
+ *		Alan Cox	:	handling.
+ *		Gerhard Koerting:	Forwarding uses IP priority hints
+ *		Teemu Rantanen	:	Fragment problems.
+ *		Alan Cox	:	General cleanup, comments and reformat
+ *		Alan Cox	:	SNMP statistics
+ *		Alan Cox	:	BSD address rule semantics. Also see
+ *					UDP as there is a nasty checksum issue
+ *					if you do things the wrong way.
+ *		Alan Cox	:	Always defrag, moved IP_FORWARD to the config.in file
+ *		Alan Cox	: 	IP options adjust sk->priority.
+ *		Pedro Roque	:	Fix mtu/length error in ip_forward.
+ *		Alan Cox	:	Avoid ip_chk_addr when possible.
+ *	Richard Underwood	:	IP multicasting.
+ *		Alan Cox	:	Cleaned up multicast handlers.
+ *		Alan Cox	:	RAW sockets demultiplex in the BSD style.
+ *		Gunther Mayer	:	Fix the SNMP reporting typo
+ *		Alan Cox	:	Always in group 224.0.0.1
+ *	Pauline Middelink	:	Fast ip_checksum update when forwarding
+ *					Masquerading support.
+ *		Alan Cox	:	Multicast loopback error for 224.0.0.1
+ *		Alan Cox	:	IP_MULTICAST_LOOP option.
+ *		Alan Cox	:	Use notifiers.
+ *		Bjorn Ekwall	:	Removed ip_csum (from slhc.c too)
+ *		Bjorn Ekwall	:	Moved ip_fast_csum to ip.h (inline!)
+ *		Stefan Becker   :       Send out ICMP HOST REDIRECT
+ *	Arnt Gulbrandsen	:	ip_build_xmit
+ *		Alan Cox	:	Per socket routing cache
+ *		Alan Cox	:	Fixed routing cache, added header cache.
+ *		Alan Cox	:	Loopback didn't work right in original ip_build_xmit - fixed it.
+ *		Alan Cox	:	Only send ICMP_REDIRECT if src/dest are the same net.
+ *		Alan Cox	:	Incoming IP option handling.
+ *		Alan Cox	:	Set saddr on raw output frames as per BSD.
+ *		Alan Cox	:	Stopped broadcast source route explosions.
+ *		Alan Cox	:	Can disable source routing
+ *		Takeshi Sone    :	Masquerading didn't work.
+ *	Dave Bonn,Alan Cox	:	Faster IP forwarding whenever possible.
+ *		Alan Cox	:	Memory leaks, tramples, misc debugging.
+ *		Alan Cox	:	Fixed multicast (by popular demand 8))
+ *		Alan Cox	:	Fixed forwarding (by even more popular demand 8))
+ *		Alan Cox	:	Fixed SNMP statistics [I think]
+ *	Gerhard Koerting	:	IP fragmentation forwarding fix
+ *		Alan Cox	:	Device lock against page fault.
+ *		Alan Cox	:	IP_HDRINCL facility.
+ *	Werner Almesberger	:	Zero fragment bug
+ *		Alan Cox	:	RAW IP frame length bug
+ *		Alan Cox	:	Outgoing firewall on build_xmit
+ *		A.N.Kuznetsov	:	IP_OPTIONS support throughout the kernel
+ *		Alan Cox	:	Multicast routing hooks
+ *		Jos Vos		:	Do accounting *before* call_in_firewall
+ *	Willy Konynenberg	:	Transparent proxying support
+ *
+ *  
+ *
+ * To Fix:
+ *		IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
+ *		and could be made very efficient with the addition of some virtual memory hacks to permit
+ *		the allocation of a buffer that can then be 'grown' by twiddling page tables.
+ *		Output fragmentation wants updating along with the buffer management to use a single 
+ *		interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
+ *		output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
+ *		fragmentation anyway.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/xfrm.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ *	SNMP management statistics
+ */
+
+DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics);
+
+/*
+ *	Process Router Attention IP option
+ */ 
+int ip_call_ra_chain(struct sk_buff *skb)
+{
+	struct ip_ra_chain *ra;
+	u8 protocol = skb->nh.iph->protocol;
+	struct sock *last = NULL;
+
+	read_lock(&ip_ra_lock);
+	for (ra = ip_ra_chain; ra; ra = ra->next) {
+		struct sock *sk = ra->sk;
+
+		/* If socket is bound to an interface, only report
+		 * the packet if it came  from that interface.
+		 */
+		if (sk && inet_sk(sk)->num == protocol &&
+		    (!sk->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
+			if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+				skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN);
+				if (skb == NULL) {
+					read_unlock(&ip_ra_lock);
+					return 1;
+				}
+			}
+			if (last) {
+				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+				if (skb2)
+					raw_rcv(last, skb2);
+			}
+			last = sk;
+		}
+	}
+
+	if (last) {
+		raw_rcv(last, skb);
+		read_unlock(&ip_ra_lock);
+		return 1;
+	}
+	read_unlock(&ip_ra_lock);
+	return 0;
+}
+
+static inline int ip_local_deliver_finish(struct sk_buff *skb)
+{
+	int ihl = skb->nh.iph->ihl*4;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	nf_debug_ip_local_deliver(skb);
+#endif /*CONFIG_NETFILTER_DEBUG*/
+
+	__skb_pull(skb, ihl);
+
+	/* Free reference early: we don't need it any more, and it may
+           hold ip_conntrack module loaded indefinitely. */
+	nf_reset(skb);
+
+        /* Point into the IP datagram, just past the header. */
+        skb->h.raw = skb->data;
+
+	rcu_read_lock();
+	{
+		/* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */
+		int protocol = skb->nh.iph->protocol;
+		int hash;
+		struct sock *raw_sk;
+		struct net_protocol *ipprot;
+
+	resubmit:
+		hash = protocol & (MAX_INET_PROTOS - 1);
+		raw_sk = sk_head(&raw_v4_htable[hash]);
+
+		/* If there maybe a raw socket we must check - if not we
+		 * don't care less
+		 */
+		if (raw_sk)
+			raw_v4_input(skb, skb->nh.iph, hash);
+
+		if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
+			int ret;
+
+			if (!ipprot->no_policy &&
+			    !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+				kfree_skb(skb);
+				goto out;
+			}
+			ret = ipprot->handler(skb);
+			if (ret < 0) {
+				protocol = -ret;
+				goto resubmit;
+			}
+			IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
+		} else {
+			if (!raw_sk) {
+				if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+					IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
+					icmp_send(skb, ICMP_DEST_UNREACH,
+						  ICMP_PROT_UNREACH, 0);
+				}
+			} else
+				IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
+			kfree_skb(skb);
+		}
+	}
+ out:
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * 	Deliver IP Packets to the higher protocol layers.
+ */ 
+int ip_local_deliver(struct sk_buff *skb)
+{
+	/*
+	 *	Reassemble IP fragments.
+	 */
+
+	if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+		skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
+		if (!skb)
+			return 0;
+	}
+
+	return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL,
+		       ip_local_deliver_finish);
+}
+
+static inline int ip_rcv_finish(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct iphdr *iph = skb->nh.iph;
+
+	/*
+	 *	Initialise the virtual path cache for the packet. It describes
+	 *	how the packet travels inside Linux networking.
+	 */ 
+	if (skb->dst == NULL) {
+		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))
+			goto drop; 
+	}
+
+#ifdef CONFIG_NET_CLS_ROUTE
+	if (skb->dst->tclassid) {
+		struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id();
+		u32 idx = skb->dst->tclassid;
+		st[idx&0xFF].o_packets++;
+		st[idx&0xFF].o_bytes+=skb->len;
+		st[(idx>>16)&0xFF].i_packets++;
+		st[(idx>>16)&0xFF].i_bytes+=skb->len;
+	}
+#endif
+
+	if (iph->ihl > 5) {
+		struct ip_options *opt;
+
+		/* It looks as overkill, because not all
+		   IP options require packet mangling.
+		   But it is the easiest for now, especially taking
+		   into account that combination of IP options
+		   and running sniffer is extremely rare condition.
+		                                      --ANK (980813)
+		*/
+
+		if (skb_cow(skb, skb_headroom(skb))) {
+			IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+		iph = skb->nh.iph;
+
+		if (ip_options_compile(NULL, skb))
+			goto inhdr_error;
+
+		opt = &(IPCB(skb)->opt);
+		if (opt->srr) {
+			struct in_device *in_dev = in_dev_get(dev);
+			if (in_dev) {
+				if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
+					if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+						printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n",
+						       NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+					in_dev_put(in_dev);
+					goto drop;
+				}
+				in_dev_put(in_dev);
+			}
+			if (ip_options_rcv_srr(skb))
+				goto drop;
+		}
+	}
+
+	return dst_input(skb);
+
+inhdr_error:
+	IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+drop:
+        kfree_skb(skb);
+        return NET_RX_DROP;
+}
+
+/*
+ * 	Main IP Receive routine.
+ */ 
+int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+	struct iphdr *iph;
+
+	/* When the interface is in promisc. mode, drop all the crap
+	 * that it receives, do not try to analyse it.
+	 */
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto drop;
+
+	IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
+		IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+		goto out;
+	}
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto inhdr_error;
+
+	iph = skb->nh.iph;
+
+	/*
+	 *	RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum.
+	 *
+	 *	Is the datagram acceptable?
+	 *
+	 *	1.	Length at least the size of an ip header
+	 *	2.	Version of 4
+	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
+	 *	4.	Doesn't have a bogus length
+	 */
+
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error; 
+
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+
+	iph = skb->nh.iph;
+
+	if (ip_fast_csum((u8 *)iph, iph->ihl) != 0)
+		goto inhdr_error; 
+
+	{
+		__u32 len = ntohs(iph->tot_len); 
+		if (skb->len < len || len < (iph->ihl<<2))
+			goto inhdr_error;
+
+		/* Our transport medium may have padded the buffer out. Now we know it
+		 * is IP we can trim to the true length of the frame.
+		 * Note this now means skb->len holds ntohs(iph->tot_len).
+		 */
+		if (pskb_trim_rcsum(skb, len)) {
+			IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+
+	return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL,
+		       ip_rcv_finish);
+
+inhdr_error:
+	IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
+drop:
+        kfree_skb(skb);
+out:
+        return NET_RX_DROP;
+}
+
+EXPORT_SYMBOL(ip_rcv);
+EXPORT_SYMBOL(ip_statistics);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
new file mode 100644
index 000000000000..6d89f3f3e701
--- /dev/null
+++ b/net/ipv4/ip_options.c
@@ -0,0 +1,625 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The options processing module for ip.c
+ *
+ * Version:	$Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $
+ *
+ * Authors:	A.N.Kuznetsov
+ *		
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+
+/* 
+ * Write options to IP header, record destination address to
+ * source route option, address of outgoing interface
+ * (we should already know it, so that this  function is allowed be
+ * called only after routing decision) and timestamp,
+ * if we originate this datagram.
+ *
+ * daddr is real destination address, next hop is recorded in IP header.
+ * saddr is address of outgoing interface.
+ */
+
+void ip_options_build(struct sk_buff * skb, struct ip_options * opt,
+			    u32 daddr, struct rtable *rt, int is_frag) 
+{
+	unsigned char * iph = skb->nh.raw;
+
+	memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options));
+	memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen);
+	opt = &(IPCB(skb)->opt);
+	opt->is_data = 0;
+
+	if (opt->srr)
+		memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4);
+
+	if (!is_frag) {
+		if (opt->rr_needaddr)
+			ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt);
+		if (opt->ts_needaddr)
+			ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt);
+		if (opt->ts_needtime) {
+			struct timeval tv;
+			__u32 midtime;
+			do_gettimeofday(&tv);
+			midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+			memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
+		}
+		return;
+	}
+	if (opt->rr) {
+		memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]);
+		opt->rr = 0;
+		opt->rr_needaddr = 0;
+	}
+	if (opt->ts) {
+		memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]);
+		opt->ts = 0;
+		opt->ts_needaddr = opt->ts_needtime = 0;
+	}
+}
+
+/* 
+ * Provided (sopt, skb) points to received options,
+ * build in dopt compiled option set appropriate for answering.
+ * i.e. invert SRR option, copy anothers,
+ * and grab room in RR/TS options.
+ *
+ * NOTE: dopt cannot point to skb.
+ */
+
+int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) 
+{
+	struct ip_options *sopt;
+	unsigned char *sptr, *dptr;
+	int soffset, doffset;
+	int	optlen;
+	u32	daddr;
+
+	memset(dopt, 0, sizeof(struct ip_options));
+
+	dopt->is_data = 1;
+
+	sopt = &(IPCB(skb)->opt);
+
+	if (sopt->optlen == 0) {
+		dopt->optlen = 0;
+		return 0;
+	}
+
+	sptr = skb->nh.raw;
+	dptr = dopt->__data;
+
+	if (skb->dst)
+		daddr = ((struct rtable*)skb->dst)->rt_spec_dst;
+	else
+		daddr = skb->nh.iph->daddr;
+
+	if (sopt->rr) {
+		optlen  = sptr[sopt->rr+1];
+		soffset = sptr[sopt->rr+2];
+		dopt->rr = dopt->optlen + sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->rr, optlen);
+		if (sopt->rr_needaddr && soffset <= optlen) {
+			if (soffset + 3 > optlen)
+				return -EINVAL;
+			dptr[2] = soffset + 4;
+			dopt->rr_needaddr = 1;
+		}
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
+	if (sopt->ts) {
+		optlen = sptr[sopt->ts+1];
+		soffset = sptr[sopt->ts+2];
+		dopt->ts = dopt->optlen + sizeof(struct iphdr);
+		memcpy(dptr, sptr+sopt->ts, optlen);
+		if (soffset <= optlen) {
+			if (sopt->ts_needaddr) {
+				if (soffset + 3 > optlen)
+					return -EINVAL;
+				dopt->ts_needaddr = 1;
+				soffset += 4;
+			}
+			if (sopt->ts_needtime) {
+				if (soffset + 3 > optlen)
+					return -EINVAL;
+				if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) {
+					dopt->ts_needtime = 1;
+					soffset += 4;
+				} else {
+					dopt->ts_needtime = 0;
+
+					if (soffset + 8 <= optlen) {
+						__u32 addr;
+
+						memcpy(&addr, sptr+soffset-1, 4);
+						if (inet_addr_type(addr) != RTN_LOCAL) {
+							dopt->ts_needtime = 1;
+							soffset += 8;
+						}
+					}
+				}
+			}
+			dptr[2] = soffset;
+		}
+		dptr += optlen;
+		dopt->optlen += optlen;
+	}
+	if (sopt->srr) {
+		unsigned char * start = sptr+sopt->srr;
+		u32 faddr;
+
+		optlen  = start[1];
+		soffset = start[2];
+		doffset = 0;
+		if (soffset > optlen)
+			soffset = optlen + 1;
+		soffset -= 4;
+		if (soffset > 3) {
+			memcpy(&faddr, &start[soffset-1], 4);
+			for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4)
+				memcpy(&dptr[doffset-1], &start[soffset-1], 4);
+			/*
+			 * RFC1812 requires to fix illegal source routes.
+			 */
+			if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0)
+				doffset -= 4;
+		}
+		if (doffset > 3) {
+			memcpy(&start[doffset-1], &daddr, 4);
+			dopt->faddr = faddr;
+			dptr[0] = start[0];
+			dptr[1] = doffset+3;
+			dptr[2] = 4;
+			dptr += doffset+3;
+			dopt->srr = dopt->optlen + sizeof(struct iphdr);
+			dopt->optlen += doffset+3;
+			dopt->is_strictroute = sopt->is_strictroute;
+		}
+	}
+	while (dopt->optlen & 3) {
+		*dptr++ = IPOPT_END;
+		dopt->optlen++;
+	}
+	return 0;
+}
+
+/*
+ *	Options "fragmenting", just fill options not
+ *	allowed in fragments with NOOPs.
+ *	Simple and stupid 8), but the most efficient way.
+ */
+
+void ip_options_fragment(struct sk_buff * skb) 
+{
+	unsigned char * optptr = skb->nh.raw;
+	struct ip_options * opt = &(IPCB(skb)->opt);
+	int  l = opt->optlen;
+	int  optlen;
+
+	while (l > 0) {
+		switch (*optptr) {
+		case IPOPT_END:
+			return;
+		case IPOPT_NOOP:
+			l--;
+			optptr++;
+			continue;
+		}
+		optlen = optptr[1];
+		if (optlen<2 || optlen>l)
+		  return;
+		if (!IPOPT_COPIED(*optptr))
+			memset(optptr, IPOPT_NOOP, optlen);
+		l -= optlen;
+		optptr += optlen;
+	}
+	opt->ts = 0;
+	opt->rr = 0;
+	opt->rr_needaddr = 0;
+	opt->ts_needaddr = 0;
+	opt->ts_needtime = 0;
+	return;
+}
+
+/*
+ * Verify options and fill pointers in struct options.
+ * Caller should clear *opt, and set opt->data.
+ * If opt == NULL, then skb->data should point to IP header.
+ */
+
+int ip_options_compile(struct ip_options * opt, struct sk_buff * skb)
+{
+	int l;
+	unsigned char * iph;
+	unsigned char * optptr;
+	int optlen;
+	unsigned char * pp_ptr = NULL;
+	struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL;
+
+	if (!opt) {
+		opt = &(IPCB(skb)->opt);
+		memset(opt, 0, sizeof(struct ip_options));
+		iph = skb->nh.raw;
+		opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr);
+		optptr = iph + sizeof(struct iphdr);
+		opt->is_data = 0;
+	} else {
+		optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]);
+		iph = optptr - sizeof(struct iphdr);
+	}
+
+	for (l = opt->optlen; l > 0; ) {
+		switch (*optptr) {
+		      case IPOPT_END:
+			for (optptr++, l--; l>0; optptr++, l--) {
+				if (*optptr != IPOPT_END) {
+					*optptr = IPOPT_END;
+					opt->is_changed = 1;
+				}
+			}
+			goto eol;
+		      case IPOPT_NOOP:
+			l--;
+			optptr++;
+			continue;
+		}
+		optlen = optptr[1];
+		if (optlen<2 || optlen>l) {
+			pp_ptr = optptr;
+			goto error;
+		}
+		switch (*optptr) {
+		      case IPOPT_SSRR:
+		      case IPOPT_LSRR:
+			if (optlen < 3) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 4) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			/* NB: cf RFC-1812 5.2.4.1 */
+			if (opt->srr) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (!skb) {
+				if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) {
+					pp_ptr = optptr + 1;
+					goto error;
+				}
+				memcpy(&opt->faddr, &optptr[3], 4);
+				if (optlen > 7)
+					memmove(&optptr[3], &optptr[7], optlen-7);
+			}
+			opt->is_strictroute = (optptr[0] == IPOPT_SSRR);
+			opt->srr = optptr - iph;
+			break;
+		      case IPOPT_RR:
+			if (opt->rr) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (optlen < 3) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 4) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			if (optptr[2] <= optlen) {
+				if (optptr[2]+3 > optlen) {
+					pp_ptr = optptr + 2;
+					goto error;
+				}
+				if (skb) {
+					memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+					opt->is_changed = 1;
+				}
+				optptr[2] += 4;
+				opt->rr_needaddr = 1;
+			}
+			opt->rr = optptr - iph;
+			break;
+		      case IPOPT_TIMESTAMP:
+			if (opt->ts) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			if (optlen < 4) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] < 5) {
+				pp_ptr = optptr + 2;
+				goto error;
+			}
+			if (optptr[2] <= optlen) {
+				__u32 * timeptr = NULL;
+				if (optptr[2]+3 > optptr[1]) {
+					pp_ptr = optptr + 2;
+					goto error;
+				}
+				switch (optptr[3]&0xF) {
+				      case IPOPT_TS_TSONLY:
+					opt->ts = optptr - iph;
+					if (skb) 
+						timeptr = (__u32*)&optptr[optptr[2]-1];
+					opt->ts_needtime = 1;
+					optptr[2] += 4;
+					break;
+				      case IPOPT_TS_TSANDADDR:
+					if (optptr[2]+7 > optptr[1]) {
+						pp_ptr = optptr + 2;
+						goto error;
+					}
+					opt->ts = optptr - iph;
+					if (skb) {
+						memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4);
+						timeptr = (__u32*)&optptr[optptr[2]+3];
+					}
+					opt->ts_needaddr = 1;
+					opt->ts_needtime = 1;
+					optptr[2] += 8;
+					break;
+				      case IPOPT_TS_PRESPEC:
+					if (optptr[2]+7 > optptr[1]) {
+						pp_ptr = optptr + 2;
+						goto error;
+					}
+					opt->ts = optptr - iph;
+					{
+						u32 addr;
+						memcpy(&addr, &optptr[optptr[2]-1], 4);
+						if (inet_addr_type(addr) == RTN_UNICAST)
+							break;
+						if (skb)
+							timeptr = (__u32*)&optptr[optptr[2]+3];
+					}
+					opt->ts_needtime = 1;
+					optptr[2] += 8;
+					break;
+				      default:
+					if (!skb && !capable(CAP_NET_RAW)) {
+						pp_ptr = optptr + 3;
+						goto error;
+					}
+					break;
+				}
+				if (timeptr) {
+					struct timeval tv;
+					__u32  midtime;
+					do_gettimeofday(&tv);
+					midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000);
+					memcpy(timeptr, &midtime, sizeof(__u32));
+					opt->is_changed = 1;
+				}
+			} else {
+				unsigned overflow = optptr[3]>>4;
+				if (overflow == 15) {
+					pp_ptr = optptr + 3;
+					goto error;
+				}
+				opt->ts = optptr - iph;
+				if (skb) {
+					optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4);
+					opt->is_changed = 1;
+				}
+			}
+			break;
+		      case IPOPT_RA:
+			if (optlen < 4) {
+				pp_ptr = optptr + 1;
+				goto error;
+			}
+			if (optptr[2] == 0 && optptr[3] == 0)
+				opt->router_alert = optptr - iph;
+			break;
+		      case IPOPT_SEC:
+		      case IPOPT_SID:
+		      default:
+			if (!skb && !capable(CAP_NET_RAW)) {
+				pp_ptr = optptr;
+				goto error;
+			}
+			break;
+		}
+		l -= optlen;
+		optptr += optlen;
+	}
+
+eol:
+	if (!pp_ptr)
+		return 0;
+
+error:
+	if (skb) {
+		icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24));
+	}
+	return -EINVAL;
+}
+
+
+/*
+ *	Undo all the changes done by ip_options_compile().
+ */
+
+void ip_options_undo(struct ip_options * opt)
+{
+	if (opt->srr) {
+		unsigned  char * optptr = opt->__data+opt->srr-sizeof(struct  iphdr);
+		memmove(optptr+7, optptr+3, optptr[1]-7);
+		memcpy(optptr+3, &opt->faddr, 4);
+	}
+	if (opt->rr_needaddr) {
+		unsigned  char * optptr = opt->__data+opt->rr-sizeof(struct  iphdr);
+		optptr[2] -= 4;
+		memset(&optptr[optptr[2]-1], 0, 4);
+	}
+	if (opt->ts) {
+		unsigned  char * optptr = opt->__data+opt->ts-sizeof(struct  iphdr);
+		if (opt->ts_needtime) {
+			optptr[2] -= 4;
+			memset(&optptr[optptr[2]-1], 0, 4);
+			if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC)
+				optptr[2] -= 4;
+		}
+		if (opt->ts_needaddr) {
+			optptr[2] -= 4;
+			memset(&optptr[optptr[2]-1], 0, 4);
+		}
+	}
+}
+
+int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user)
+{
+	struct ip_options *opt;
+
+	opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL);
+	if (!opt)
+		return -ENOMEM;
+	memset(opt, 0, sizeof(struct ip_options));
+	if (optlen) {
+		if (user) {
+			if (copy_from_user(opt->__data, data, optlen)) {
+				kfree(opt);
+				return -EFAULT;
+			}
+		} else
+			memcpy(opt->__data, data, optlen);
+	}
+	while (optlen & 3)
+		opt->__data[optlen++] = IPOPT_END;
+	opt->optlen = optlen;
+	opt->is_data = 1;
+	opt->is_setbyuser = 1;
+	if (optlen && ip_options_compile(opt, NULL)) {
+		kfree(opt);
+		return -EINVAL;
+	}
+	if (*optp)
+		kfree(*optp);
+	*optp = opt;
+	return 0;
+}
+
+void ip_forward_options(struct sk_buff *skb)
+{
+	struct   ip_options * opt	= &(IPCB(skb)->opt);
+	unsigned char * optptr;
+	struct rtable *rt = (struct rtable*)skb->dst;
+	unsigned char *raw = skb->nh.raw;
+
+	if (opt->rr_needaddr) {
+		optptr = (unsigned char *)raw + opt->rr;
+		ip_rt_get_source(&optptr[optptr[2]-5], rt);
+		opt->is_changed = 1;
+	}
+	if (opt->srr_is_hit) {
+		int srrptr, srrspace;
+
+		optptr = raw + opt->srr;
+
+		for ( srrptr=optptr[2], srrspace = optptr[1];
+		     srrptr <= srrspace;
+		     srrptr += 4
+		     ) {
+			if (srrptr + 3 > srrspace)
+				break;
+			if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0)
+				break;
+		}
+		if (srrptr + 3 <= srrspace) {
+			opt->is_changed = 1;
+			ip_rt_get_source(&optptr[srrptr-1], rt);
+			skb->nh.iph->daddr = rt->rt_dst;
+			optptr[2] = srrptr+4;
+		} else if (net_ratelimit())
+			printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n");
+		if (opt->ts_needaddr) {
+			optptr = raw + opt->ts;
+			ip_rt_get_source(&optptr[optptr[2]-9], rt);
+			opt->is_changed = 1;
+		}
+	}
+	if (opt->is_changed) {
+		opt->is_changed = 0;
+		ip_send_check(skb->nh.iph);
+	}
+}
+
+int ip_options_rcv_srr(struct sk_buff *skb)
+{
+	struct ip_options *opt = &(IPCB(skb)->opt);
+	int srrspace, srrptr;
+	u32 nexthop;
+	struct iphdr *iph = skb->nh.iph;
+	unsigned char * optptr = skb->nh.raw + opt->srr;
+	struct rtable *rt = (struct rtable*)skb->dst;
+	struct rtable *rt2;
+	int err;
+
+	if (!opt->srr)
+		return 0;
+
+	if (skb->pkt_type != PACKET_HOST)
+		return -EINVAL;
+	if (rt->rt_type == RTN_UNICAST) {
+		if (!opt->is_strictroute)
+			return 0;
+		icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24));
+		return -EINVAL;
+	}
+	if (rt->rt_type != RTN_LOCAL)
+		return -EINVAL;
+
+	for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) {
+		if (srrptr + 3 > srrspace) {
+			icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24));
+			return -EINVAL;
+		}
+		memcpy(&nexthop, &optptr[srrptr-1], 4);
+
+		rt = (struct rtable*)skb->dst;
+		skb->dst = NULL;
+		err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev);
+		rt2 = (struct rtable*)skb->dst;
+		if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
+			ip_rt_put(rt2);
+			skb->dst = &rt->u.dst;
+			return -EINVAL;
+		}
+		ip_rt_put(rt);
+		if (rt2->rt_type != RTN_LOCAL)
+			break;
+		/* Superfast 8) loopback forward */
+		memcpy(&iph->daddr, &optptr[srrptr-1], 4);
+		opt->is_changed = 1;
+	}
+	if (srrptr <= srrspace) {
+		opt->srr_is_hit = 1;
+		opt->is_changed = 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(ip_options_compile);
+EXPORT_SYMBOL(ip_options_undo);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
new file mode 100644
index 000000000000..30ab7b6ab761
--- /dev/null
+++ b/net/ipv4/ip_output.c
@@ -0,0 +1,1359 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The Internet Protocol (IP) output module.
+ *
+ * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Donald Becker, <becker@super.org>
+ *		Alan Cox, <Alan.Cox@linux.org>
+ *		Richard Underwood
+ *		Stefan Becker, <stefanb@yello.ping.de>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ *	See ip_input.c for original log
+ *
+ *	Fixes:
+ *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
+ *		Mike Kilburn	:	htons() missing in ip_build_xmit.
+ *		Bradford Johnson:	Fix faulty handling of some frames when 
+ *					no route is found.
+ *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
+ *					(in case if packet not accepted by
+ *					output firewall rules)
+ *		Mike McLagan	:	Routing by source
+ *		Alexey Kuznetsov:	use new route cache
+ *		Andi Kleen:		Fix broken PMTU recovery and remove
+ *					some redundant tests.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *		Andi Kleen	: 	Replace ip_reply with ip_send_reply.
+ *		Andi Kleen	:	Split fast and slow ip_build_xmit path 
+ *					for decreased register pressure on x86 
+ *					and more readibility. 
+ *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
+ *					silently drop skb instead of failing with -EPERM.
+ *		Detlev Wengorz	:	Copy protocol for fragments.
+ *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
+ *					datagrams.
+ *		Hirokazu Takahashi:	sendfile() on UDP works now.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/config.h>
+
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/init.h>
+
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/arp.h>
+#include <net/icmp.h>
+#include <net/raw.h>
+#include <net/checksum.h>
+#include <net/inetpeer.h>
+#include <net/checksum.h>
+#include <linux/igmp.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/mroute.h>
+#include <linux/netlink.h>
+
+/*
+ *      Shall we try to damage output packets if routing dev changes?
+ */
+
+int sysctl_ip_dynaddr;
+int sysctl_ip_default_ttl = IPDEFTTL;
+
+/* Generate a checksum for an outgoing IP datagram. */
+__inline__ void ip_send_check(struct iphdr *iph)
+{
+	iph->check = 0;
+	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+}
+
+/* dev_loopback_xmit for use with netfilter. */
+static int ip_dev_loopback_xmit(struct sk_buff *newskb)
+{
+	newskb->mac.raw = newskb->data;
+	__skb_pull(newskb, newskb->nh.raw - newskb->data);
+	newskb->pkt_type = PACKET_LOOPBACK;
+	newskb->ip_summed = CHECKSUM_UNNECESSARY;
+	BUG_TRAP(newskb->dst);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	nf_debug_ip_loopback_xmit(newskb);
+#endif
+	netif_rx(newskb);
+	return 0;
+}
+
+static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
+{
+	int ttl = inet->uc_ttl;
+
+	if (ttl < 0)
+		ttl = dst_metric(dst, RTAX_HOPLIMIT);
+	return ttl;
+}
+
+/* 
+ *		Add an ip header to a skbuff and send it out.
+ *
+ */
+int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
+			  u32 saddr, u32 daddr, struct ip_options *opt)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = (struct rtable *)skb->dst;
+	struct iphdr *iph;
+
+	/* Build the IP header. */
+	if (opt)
+		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen);
+	else
+		iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr));
+
+	iph->version  = 4;
+	iph->ihl      = 5;
+	iph->tos      = inet->tos;
+	if (ip_dont_fragment(sk, &rt->u.dst))
+		iph->frag_off = htons(IP_DF);
+	else
+		iph->frag_off = 0;
+	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
+	iph->daddr    = rt->rt_dst;
+	iph->saddr    = rt->rt_src;
+	iph->protocol = sk->sk_protocol;
+	iph->tot_len  = htons(skb->len);
+	ip_select_ident(iph, &rt->u.dst, sk);
+	skb->nh.iph   = iph;
+
+	if (opt && opt->optlen) {
+		iph->ihl += opt->optlen>>2;
+		ip_options_build(skb, opt, daddr, rt, 0);
+	}
+	ip_send_check(iph);
+
+	skb->priority = sk->sk_priority;
+
+	/* Send it out. */
+	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+		       dst_output);
+}
+
+static inline int ip_finish_output2(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct hh_cache *hh = dst->hh;
+	struct net_device *dev = dst->dev;
+	int hh_len = LL_RESERVED_SPACE(dev);
+
+	/* Be paranoid, rather than too clever. */
+	if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+		if (skb2 == NULL) {
+			kfree_skb(skb);
+			return -ENOMEM;
+		}
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+		kfree_skb(skb);
+		skb = skb2;
+	}
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	nf_debug_ip_finish_output2(skb);
+#endif /*CONFIG_NETFILTER_DEBUG*/
+
+	if (hh) {
+		int hh_alen;
+
+		read_lock_bh(&hh->hh_lock);
+		hh_alen = HH_DATA_ALIGN(hh->hh_len);
+  		memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
+		read_unlock_bh(&hh->hh_lock);
+	        skb_push(skb, hh->hh_len);
+		return hh->hh_output(skb);
+	} else if (dst->neighbour)
+		return dst->neighbour->output(skb);
+
+	if (net_ratelimit())
+		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
+	kfree_skb(skb);
+	return -EINVAL;
+}
+
+int ip_finish_output(struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dst->dev;
+
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+
+	return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev,
+		       ip_finish_output2);
+}
+
+int ip_mc_output(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	struct rtable *rt = (struct rtable*)skb->dst;
+	struct net_device *dev = rt->u.dst.dev;
+
+	/*
+	 *	If the indicated interface is up and running, send the packet.
+	 */
+	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
+
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+
+	/*
+	 *	Multicasts are looped back for other local users
+	 */
+
+	if (rt->rt_flags&RTCF_MULTICAST) {
+		if ((!sk || inet_sk(sk)->mc_loop)
+#ifdef CONFIG_IP_MROUTE
+		/* Small optimization: do not loopback not local frames,
+		   which returned after forwarding; they will be  dropped
+		   by ip_mr_input in any case.
+		   Note, that local frames are looped back to be delivered
+		   to local recipients.
+
+		   This check is duplicated in ip_mr_input at the moment.
+		 */
+		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
+#endif
+		) {
+			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+			if (newskb)
+				NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
+					newskb->dev, 
+					ip_dev_loopback_xmit);
+		}
+
+		/* Multicasts with ttl 0 must not go beyond the host */
+
+		if (skb->nh.iph->ttl == 0) {
+			kfree_skb(skb);
+			return 0;
+		}
+	}
+
+	if (rt->rt_flags&RTCF_BROADCAST) {
+		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+		if (newskb)
+			NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL,
+				newskb->dev, ip_dev_loopback_xmit);
+	}
+
+	if (skb->len > dst_mtu(&rt->u.dst))
+		return ip_fragment(skb, ip_finish_output);
+	else
+		return ip_finish_output(skb);
+}
+
+int ip_output(struct sk_buff *skb)
+{
+	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);
+
+	if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size)
+		return ip_fragment(skb, ip_finish_output);
+	else
+		return ip_finish_output(skb);
+}
+
+int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
+{
+	struct sock *sk = skb->sk;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_options *opt = inet->opt;
+	struct rtable *rt;
+	struct iphdr *iph;
+
+	/* Skip all of this if the packet is already routed,
+	 * f.e. by something like SCTP.
+	 */
+	rt = (struct rtable *) skb->dst;
+	if (rt != NULL)
+		goto packet_routed;
+
+	/* Make sure we can route this packet. */
+	rt = (struct rtable *)__sk_dst_check(sk, 0);
+	if (rt == NULL) {
+		u32 daddr;
+
+		/* Use correct destination address if we have options. */
+		daddr = inet->daddr;
+		if(opt && opt->srr)
+			daddr = opt->faddr;
+
+		{
+			struct flowi fl = { .oif = sk->sk_bound_dev_if,
+					    .nl_u = { .ip4_u =
+						      { .daddr = daddr,
+							.saddr = inet->saddr,
+							.tos = RT_CONN_FLAGS(sk) } },
+					    .proto = sk->sk_protocol,
+					    .uli_u = { .ports =
+						       { .sport = inet->sport,
+							 .dport = inet->dport } } };
+
+			/* If this fails, retransmit mechanism of transport layer will
+			 * keep trying until route appears or the connection times
+			 * itself out.
+			 */
+			if (ip_route_output_flow(&rt, &fl, sk, 0))
+				goto no_route;
+		}
+		__sk_dst_set(sk, &rt->u.dst);
+		tcp_v4_setup_caps(sk, &rt->u.dst);
+	}
+	skb->dst = dst_clone(&rt->u.dst);
+
+packet_routed:
+	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
+		goto no_route;
+
+	/* OK, we know where to send it, allocate and build IP header. */
+	iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
+	*((__u16 *)iph)	= htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
+	iph->tot_len = htons(skb->len);
+	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
+		iph->frag_off = htons(IP_DF);
+	else
+		iph->frag_off = 0;
+	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
+	iph->protocol = sk->sk_protocol;
+	iph->saddr    = rt->rt_src;
+	iph->daddr    = rt->rt_dst;
+	skb->nh.iph   = iph;
+	/* Transport layer set skb->h.foo itself. */
+
+	if (opt && opt->optlen) {
+		iph->ihl += opt->optlen >> 2;
+		ip_options_build(skb, opt, inet->daddr, rt, 0);
+	}
+
+	ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs);
+
+	/* Add an IP checksum. */
+	ip_send_check(iph);
+
+	skb->priority = sk->sk_priority;
+
+	return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+		       dst_output);
+
+no_route:
+	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
+	kfree_skb(skb);
+	return -EHOSTUNREACH;
+}
+
+
+static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
+{
+	to->pkt_type = from->pkt_type;
+	to->priority = from->priority;
+	to->protocol = from->protocol;
+	to->security = from->security;
+	dst_release(to->dst);
+	to->dst = dst_clone(from->dst);
+	to->dev = from->dev;
+
+	/* Copy the flags to each fragment. */
+	IPCB(to)->flags = IPCB(from)->flags;
+
+#ifdef CONFIG_NET_SCHED
+	to->tc_index = from->tc_index;
+#endif
+#ifdef CONFIG_NETFILTER
+	to->nfmark = from->nfmark;
+	to->nfcache = from->nfcache;
+	/* Connection association is same as pre-frag packet */
+	nf_conntrack_put(to->nfct);
+	to->nfct = from->nfct;
+	nf_conntrack_get(to->nfct);
+	to->nfctinfo = from->nfctinfo;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	nf_bridge_put(to->nf_bridge);
+	to->nf_bridge = from->nf_bridge;
+	nf_bridge_get(to->nf_bridge);
+#endif
+#ifdef CONFIG_NETFILTER_DEBUG
+	to->nf_debug = from->nf_debug;
+#endif
+#endif
+}
+
+/*
+ *	This IP datagram is too large to be sent in one piece.  Break it up into
+ *	smaller pieces (each of size equal to IP header plus
+ *	a block of the data of the original IP data part) that will yet fit in a
+ *	single device frame, and queue such a frame for sending.
+ */
+
+int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+{
+	struct iphdr *iph;
+	int raw = 0;
+	int ptr;
+	struct net_device *dev;
+	struct sk_buff *skb2;
+	unsigned int mtu, hlen, left, len, ll_rs;
+	int offset;
+	int not_last_frag;
+	struct rtable *rt = (struct rtable*)skb->dst;
+	int err = 0;
+
+	dev = rt->u.dst.dev;
+
+	/*
+	 *	Point into the IP datagram header.
+	 */
+
+	iph = skb->nh.iph;
+
+	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+			  htonl(dst_mtu(&rt->u.dst)));
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+
+	/*
+	 *	Setup starting values.
+	 */
+
+	hlen = iph->ihl * 4;
+	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
+
+	/* When frag_list is given, use it. First, check its validity:
+	 * some transformers could create wrong frag_list or break existing
+	 * one, it is not prohibited. In this case fall back to copying.
+	 *
+	 * LATER: this step can be merged to real generation of fragments,
+	 * we can switch to copy when see the first bad fragment.
+	 */
+	if (skb_shinfo(skb)->frag_list) {
+		struct sk_buff *frag;
+		int first_len = skb_pagelen(skb);
+
+		if (first_len - hlen > mtu ||
+		    ((first_len - hlen) & 7) ||
+		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
+		    skb_cloned(skb))
+			goto slow_path;
+
+		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
+			/* Correct geometry. */
+			if (frag->len > mtu ||
+			    ((frag->len & 7) && frag->next) ||
+			    skb_headroom(frag) < hlen)
+			    goto slow_path;
+
+			/* Partially cloned skb? */
+			if (skb_shared(frag))
+				goto slow_path;
+		}
+
+		/* Everything is OK. Generate! */
+
+		err = 0;
+		offset = 0;
+		frag = skb_shinfo(skb)->frag_list;
+		skb_shinfo(skb)->frag_list = NULL;
+		skb->data_len = first_len - skb_headlen(skb);
+		skb->len = first_len;
+		iph->tot_len = htons(first_len);
+		iph->frag_off = htons(IP_MF);
+		ip_send_check(iph);
+
+		for (;;) {
+			/* Prepare header of the next frame,
+			 * before previous one went down. */
+			if (frag) {
+				frag->ip_summed = CHECKSUM_NONE;
+				frag->h.raw = frag->data;
+				frag->nh.raw = __skb_push(frag, hlen);
+				memcpy(frag->nh.raw, iph, hlen);
+				iph = frag->nh.iph;
+				iph->tot_len = htons(frag->len);
+				ip_copy_metadata(frag, skb);
+				if (offset == 0)
+					ip_options_fragment(frag);
+				offset += skb->len - hlen;
+				iph->frag_off = htons(offset>>3);
+				if (frag->next != NULL)
+					iph->frag_off |= htons(IP_MF);
+				/* Ready, complete checksum */
+				ip_send_check(iph);
+			}
+
+			err = output(skb);
+
+			if (err || !frag)
+				break;
+
+			skb = frag;
+			frag = skb->next;
+			skb->next = NULL;
+		}
+
+		if (err == 0) {
+			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
+			return 0;
+		}
+
+		while (frag) {
+			skb = frag->next;
+			kfree_skb(frag);
+			frag = skb;
+		}
+		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+		return err;
+	}
+
+slow_path:
+	left = skb->len - hlen;		/* Space per frame */
+	ptr = raw + hlen;		/* Where to start from */
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
+	 * we need to make room for the encapsulating header */
+	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb));
+	mtu -= nf_bridge_pad(skb);
+#else
+	ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev);
+#endif
+	/*
+	 *	Fragment the datagram.
+	 */
+
+	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
+	not_last_frag = iph->frag_off & htons(IP_MF);
+
+	/*
+	 *	Keep copying data until we run out.
+	 */
+
+	while(left > 0)	{
+		len = left;
+		/* IF: it doesn't fit, use 'mtu' - the data space left */
+		if (len > mtu)
+			len = mtu;
+		/* IF: we are not sending upto and including the packet end
+		   then align the next start on an eight byte boundary */
+		if (len < left)	{
+			len &= ~7;
+		}
+		/*
+		 *	Allocate buffer.
+		 */
+
+		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
+			NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n"));
+			err = -ENOMEM;
+			goto fail;
+		}
+
+		/*
+		 *	Set up data on packet
+		 */
+
+		ip_copy_metadata(skb2, skb);
+		skb_reserve(skb2, ll_rs);
+		skb_put(skb2, len + hlen);
+		skb2->nh.raw = skb2->data;
+		skb2->h.raw = skb2->data + hlen;
+
+		/*
+		 *	Charge the memory for the fragment to any owner
+		 *	it might possess
+		 */
+
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+
+		/*
+		 *	Copy the packet header into the new buffer.
+		 */
+
+		memcpy(skb2->nh.raw, skb->data, hlen);
+
+		/*
+		 *	Copy a block of the IP datagram.
+		 */
+		if (skb_copy_bits(skb, ptr, skb2->h.raw, len))
+			BUG();
+		left -= len;
+
+		/*
+		 *	Fill in the new header fields.
+		 */
+		iph = skb2->nh.iph;
+		iph->frag_off = htons((offset >> 3));
+
+		/* ANK: dirty, but effective trick. Upgrade options only if
+		 * the segment to be fragmented was THE FIRST (otherwise,
+		 * options are already fixed) and make it ONCE
+		 * on the initial skb, so that all the following fragments
+		 * will inherit fixed options.
+		 */
+		if (offset == 0)
+			ip_options_fragment(skb);
+
+		/*
+		 *	Added AC : If we are fragmenting a fragment that's not the
+		 *		   last fragment then keep MF on each bit
+		 */
+		if (left > 0 || not_last_frag)
+			iph->frag_off |= htons(IP_MF);
+		ptr += len;
+		offset += len;
+
+		/*
+		 *	Put this fragment into the sending queue.
+		 */
+
+		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
+
+		iph->tot_len = htons(len + hlen);
+
+		ip_send_check(iph);
+
+		err = output(skb2);
+		if (err)
+			goto fail;
+	}
+	kfree_skb(skb);
+	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
+	return err;
+
+fail:
+	kfree_skb(skb); 
+	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
+	return err;
+}
+
+int
+ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
+{
+	struct iovec *iov = from;
+
+	if (skb->ip_summed == CHECKSUM_HW) {
+		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
+			return -EFAULT;
+	} else {
+		unsigned int csum = 0;
+		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
+			return -EFAULT;
+		skb->csum = csum_block_add(skb->csum, csum, odd);
+	}
+	return 0;
+}
+
+static inline unsigned int
+csum_page(struct page *page, int offset, int copy)
+{
+	char *kaddr;
+	unsigned int csum;
+	kaddr = kmap(page);
+	csum = csum_partial(kaddr + offset, copy, 0);
+	kunmap(page);
+	return csum;
+}
+
+/*
+ *	ip_append_data() and ip_append_page() can make one large IP datagram
+ *	from many pieces of data. Each pieces will be holded on the socket
+ *	until ip_push_pending_frames() is called. Each piece can be a page
+ *	or non-page data.
+ *	
+ *	Not only UDP, other transport protocols - e.g. raw sockets - can use
+ *	this interface potentially.
+ *
+ *	LATER: length must be adjusted by pad at tail, when it is required.
+ */
+int ip_append_data(struct sock *sk,
+		   int getfrag(void *from, char *to, int offset, int len,
+			       int odd, struct sk_buff *skb),
+		   void *from, int length, int transhdrlen,
+		   struct ipcm_cookie *ipc, struct rtable *rt,
+		   unsigned int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sk_buff *skb;
+
+	struct ip_options *opt = NULL;
+	int hh_len;
+	int exthdrlen;
+	int mtu;
+	int copy;
+	int err;
+	int offset = 0;
+	unsigned int maxfraglen, fragheaderlen;
+	int csummode = CHECKSUM_NONE;
+
+	if (flags&MSG_PROBE)
+		return 0;
+
+	if (skb_queue_empty(&sk->sk_write_queue)) {
+		/*
+		 * setup for corking.
+		 */
+		opt = ipc->opt;
+		if (opt) {
+			if (inet->cork.opt == NULL) {
+				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
+				if (unlikely(inet->cork.opt == NULL))
+					return -ENOBUFS;
+			}
+			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
+			inet->cork.flags |= IPCORK_OPT;
+			inet->cork.addr = ipc->addr;
+		}
+		dst_hold(&rt->u.dst);
+		inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
+		inet->cork.rt = rt;
+		inet->cork.length = 0;
+		sk->sk_sndmsg_page = NULL;
+		sk->sk_sndmsg_off = 0;
+		if ((exthdrlen = rt->u.dst.header_len) != 0) {
+			length += exthdrlen;
+			transhdrlen += exthdrlen;
+		}
+	} else {
+		rt = inet->cork.rt;
+		if (inet->cork.flags & IPCORK_OPT)
+			opt = inet->cork.opt;
+
+		transhdrlen = 0;
+		exthdrlen = 0;
+		mtu = inet->cork.fragsize;
+	}
+	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+
+	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+		return -EMSGSIZE;
+	}
+
+	/*
+	 * transhdrlen > 0 means that this is the first fragment and we wish
+	 * it won't be fragmented in the future.
+	 */
+	if (transhdrlen &&
+	    length + fragheaderlen <= mtu &&
+	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
+	    !exthdrlen)
+		csummode = CHECKSUM_HW;
+
+	inet->cork.length += length;
+
+	/* So, what's going on in the loop below?
+	 *
+	 * We use calculated fragment length to generate chained skb,
+	 * each of segments is IP fragment ready for sending to network after
+	 * adding appropriate IP header.
+	 */
+
+	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+		goto alloc_new_skb;
+
+	while (length > 0) {
+		/* Check if the remaining data fits into current packet. */
+		copy = mtu - skb->len;
+		if (copy < length)
+			copy = maxfraglen - skb->len;
+		if (copy <= 0) {
+			char *data;
+			unsigned int datalen;
+			unsigned int fraglen;
+			unsigned int fraggap;
+			unsigned int alloclen;
+			struct sk_buff *skb_prev;
+alloc_new_skb:
+			skb_prev = skb;
+			if (skb_prev)
+				fraggap = skb_prev->len - maxfraglen;
+			else
+				fraggap = 0;
+
+			/*
+			 * If remaining data exceeds the mtu,
+			 * we know we need more fragment(s).
+			 */
+			datalen = length + fraggap;
+			if (datalen > mtu - fragheaderlen)
+				datalen = maxfraglen - fragheaderlen;
+			fraglen = datalen + fragheaderlen;
+
+			if ((flags & MSG_MORE) && 
+			    !(rt->u.dst.dev->features&NETIF_F_SG))
+				alloclen = mtu;
+			else
+				alloclen = datalen + fragheaderlen;
+
+			/* The last fragment gets additional space at tail.
+			 * Note, with MSG_MORE we overallocate on fragments,
+			 * because we have no idea what fragment will be
+			 * the last.
+			 */
+			if (datalen == length)
+				alloclen += rt->u.dst.trailer_len;
+
+			if (transhdrlen) {
+				skb = sock_alloc_send_skb(sk, 
+						alloclen + hh_len + 15,
+						(flags & MSG_DONTWAIT), &err);
+			} else {
+				skb = NULL;
+				if (atomic_read(&sk->sk_wmem_alloc) <=
+				    2 * sk->sk_sndbuf)
+					skb = sock_wmalloc(sk, 
+							   alloclen + hh_len + 15, 1,
+							   sk->sk_allocation);
+				if (unlikely(skb == NULL))
+					err = -ENOBUFS;
+			}
+			if (skb == NULL)
+				goto error;
+
+			/*
+			 *	Fill in the control structures
+			 */
+			skb->ip_summed = csummode;
+			skb->csum = 0;
+			skb_reserve(skb, hh_len);
+
+			/*
+			 *	Find where to start putting bytes.
+			 */
+			data = skb_put(skb, fraglen);
+			skb->nh.raw = data + exthdrlen;
+			data += fragheaderlen;
+			skb->h.raw = data + exthdrlen;
+
+			if (fraggap) {
+				skb->csum = skb_copy_and_csum_bits(
+					skb_prev, maxfraglen,
+					data + transhdrlen, fraggap, 0);
+				skb_prev->csum = csum_sub(skb_prev->csum,
+							  skb->csum);
+				data += fraggap;
+				skb_trim(skb_prev, maxfraglen);
+			}
+
+			copy = datalen - transhdrlen - fraggap;
+			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+				err = -EFAULT;
+				kfree_skb(skb);
+				goto error;
+			}
+
+			offset += copy;
+			length -= datalen - fraggap;
+			transhdrlen = 0;
+			exthdrlen = 0;
+			csummode = CHECKSUM_NONE;
+
+			/*
+			 * Put the packet on the pending queue.
+			 */
+			__skb_queue_tail(&sk->sk_write_queue, skb);
+			continue;
+		}
+
+		if (copy > length)
+			copy = length;
+
+		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
+			unsigned int off;
+
+			off = skb->len;
+			if (getfrag(from, skb_put(skb, copy), 
+					offset, copy, off, skb) < 0) {
+				__skb_trim(skb, off);
+				err = -EFAULT;
+				goto error;
+			}
+		} else {
+			int i = skb_shinfo(skb)->nr_frags;
+			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
+			struct page *page = sk->sk_sndmsg_page;
+			int off = sk->sk_sndmsg_off;
+			unsigned int left;
+
+			if (page && (left = PAGE_SIZE - off) > 0) {
+				if (copy >= left)
+					copy = left;
+				if (page != frag->page) {
+					if (i == MAX_SKB_FRAGS) {
+						err = -EMSGSIZE;
+						goto error;
+					}
+					get_page(page);
+	 				skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
+					frag = &skb_shinfo(skb)->frags[i];
+				}
+			} else if (i < MAX_SKB_FRAGS) {
+				if (copy > PAGE_SIZE)
+					copy = PAGE_SIZE;
+				page = alloc_pages(sk->sk_allocation, 0);
+				if (page == NULL)  {
+					err = -ENOMEM;
+					goto error;
+				}
+				sk->sk_sndmsg_page = page;
+				sk->sk_sndmsg_off = 0;
+
+				skb_fill_page_desc(skb, i, page, 0, 0);
+				frag = &skb_shinfo(skb)->frags[i];
+				skb->truesize += PAGE_SIZE;
+				atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc);
+			} else {
+				err = -EMSGSIZE;
+				goto error;
+			}
+			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
+				err = -EFAULT;
+				goto error;
+			}
+			sk->sk_sndmsg_off += copy;
+			frag->size += copy;
+			skb->len += copy;
+			skb->data_len += copy;
+		}
+		offset += copy;
+		length -= copy;
+	}
+
+	return 0;
+
+error:
+	inet->cork.length -= length;
+	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+	return err; 
+}
+
+ssize_t	ip_append_page(struct sock *sk, struct page *page,
+		       int offset, size_t size, int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sk_buff *skb;
+	struct rtable *rt;
+	struct ip_options *opt = NULL;
+	int hh_len;
+	int mtu;
+	int len;
+	int err;
+	unsigned int maxfraglen, fragheaderlen, fraggap;
+
+	if (inet->hdrincl)
+		return -EPERM;
+
+	if (flags&MSG_PROBE)
+		return 0;
+
+	if (skb_queue_empty(&sk->sk_write_queue))
+		return -EINVAL;
+
+	rt = inet->cork.rt;
+	if (inet->cork.flags & IPCORK_OPT)
+		opt = inet->cork.opt;
+
+	if (!(rt->u.dst.dev->features&NETIF_F_SG))
+		return -EOPNOTSUPP;
+
+	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+	mtu = inet->cork.fragsize;
+
+	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
+
+	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+		return -EMSGSIZE;
+	}
+
+	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
+		return -EINVAL;
+
+	inet->cork.length += size;
+
+	while (size > 0) {
+		int i;
+
+		/* Check if the remaining data fits into current packet. */
+		len = mtu - skb->len;
+		if (len < size)
+			len = maxfraglen - skb->len;
+		if (len <= 0) {
+			struct sk_buff *skb_prev;
+			char *data;
+			struct iphdr *iph;
+			int alloclen;
+
+			skb_prev = skb;
+			if (skb_prev)
+				fraggap = skb_prev->len - maxfraglen;
+			else
+				fraggap = 0;
+
+			alloclen = fragheaderlen + hh_len + fraggap + 15;
+			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
+			if (unlikely(!skb)) {
+				err = -ENOBUFS;
+				goto error;
+			}
+
+			/*
+			 *	Fill in the control structures
+			 */
+			skb->ip_summed = CHECKSUM_NONE;
+			skb->csum = 0;
+			skb_reserve(skb, hh_len);
+
+			/*
+			 *	Find where to start putting bytes.
+			 */
+			data = skb_put(skb, fragheaderlen + fraggap);
+			skb->nh.iph = iph = (struct iphdr *)data;
+			data += fragheaderlen;
+			skb->h.raw = data;
+
+			if (fraggap) {
+				skb->csum = skb_copy_and_csum_bits(
+					skb_prev, maxfraglen,
+					data, fraggap, 0);
+				skb_prev->csum = csum_sub(skb_prev->csum,
+							  skb->csum);
+				skb_trim(skb_prev, maxfraglen);
+			}
+
+			/*
+			 * Put the packet on the pending queue.
+			 */
+			__skb_queue_tail(&sk->sk_write_queue, skb);
+			continue;
+		}
+
+		i = skb_shinfo(skb)->nr_frags;
+		if (len > size)
+			len = size;
+		if (skb_can_coalesce(skb, i, page, offset)) {
+			skb_shinfo(skb)->frags[i-1].size += len;
+		} else if (i < MAX_SKB_FRAGS) {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, len);
+		} else {
+			err = -EMSGSIZE;
+			goto error;
+		}
+
+		if (skb->ip_summed == CHECKSUM_NONE) {
+			unsigned int csum;
+			csum = csum_page(page, offset, len);
+			skb->csum = csum_block_add(skb->csum, csum, skb->len);
+		}
+
+		skb->len += len;
+		skb->data_len += len;
+		offset += len;
+		size -= len;
+	}
+	return 0;
+
+error:
+	inet->cork.length -= size;
+	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+	return err;
+}
+
+/*
+ *	Combined all pending IP fragments on the socket as one IP datagram
+ *	and push them out.
+ */
+int ip_push_pending_frames(struct sock *sk)
+{
+	struct sk_buff *skb, *tmp_skb;
+	struct sk_buff **tail_skb;
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_options *opt = NULL;
+	struct rtable *rt = inet->cork.rt;
+	struct iphdr *iph;
+	int df = 0;
+	__u8 ttl;
+	int err = 0;
+
+	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
+		goto out;
+	tail_skb = &(skb_shinfo(skb)->frag_list);
+
+	/* move skb->data to ip header from ext header */
+	if (skb->data < skb->nh.raw)
+		__skb_pull(skb, skb->nh.raw - skb->data);
+	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
+		__skb_pull(tmp_skb, skb->h.raw - skb->nh.raw);
+		*tail_skb = tmp_skb;
+		tail_skb = &(tmp_skb->next);
+		skb->len += tmp_skb->len;
+		skb->data_len += tmp_skb->len;
+		skb->truesize += tmp_skb->truesize;
+		__sock_put(tmp_skb->sk);
+		tmp_skb->destructor = NULL;
+		tmp_skb->sk = NULL;
+	}
+
+	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
+	 * to fragment the frame generated here. No matter, what transforms
+	 * how transforms change size of the packet, it will come out.
+	 */
+	if (inet->pmtudisc != IP_PMTUDISC_DO)
+		skb->local_df = 1;
+
+	/* DF bit is set when we want to see DF on outgoing frames.
+	 * If local_df is set too, we still allow to fragment this frame
+	 * locally. */
+	if (inet->pmtudisc == IP_PMTUDISC_DO ||
+	    (skb->len <= dst_mtu(&rt->u.dst) &&
+	     ip_dont_fragment(sk, &rt->u.dst)))
+		df = htons(IP_DF);
+
+	if (inet->cork.flags & IPCORK_OPT)
+		opt = inet->cork.opt;
+
+	if (rt->rt_type == RTN_MULTICAST)
+		ttl = inet->mc_ttl;
+	else
+		ttl = ip_select_ttl(inet, &rt->u.dst);
+
+	iph = (struct iphdr *)skb->data;
+	iph->version = 4;
+	iph->ihl = 5;
+	if (opt) {
+		iph->ihl += opt->optlen>>2;
+		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
+	}
+	iph->tos = inet->tos;
+	iph->tot_len = htons(skb->len);
+	iph->frag_off = df;
+	if (!df) {
+		__ip_select_ident(iph, &rt->u.dst, 0);
+	} else {
+		iph->id = htons(inet->id++);
+	}
+	iph->ttl = ttl;
+	iph->protocol = sk->sk_protocol;
+	iph->saddr = rt->rt_src;
+	iph->daddr = rt->rt_dst;
+	ip_send_check(iph);
+
+	skb->priority = sk->sk_priority;
+	skb->dst = dst_clone(&rt->u.dst);
+
+	/* Netfilter gets whole the not fragmented skb. */
+	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, 
+		      skb->dst->dev, dst_output);
+	if (err) {
+		if (err > 0)
+			err = inet->recverr ? net_xmit_errno(err) : 0;
+		if (err)
+			goto error;
+	}
+
+out:
+	inet->cork.flags &= ~IPCORK_OPT;
+	if (inet->cork.opt) {
+		kfree(inet->cork.opt);
+		inet->cork.opt = NULL;
+	}
+	if (inet->cork.rt) {
+		ip_rt_put(inet->cork.rt);
+		inet->cork.rt = NULL;
+	}
+	return err;
+
+error:
+	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+	goto out;
+}
+
+/*
+ *	Throw away all pending data on the socket.
+ */
+void ip_flush_pending_frames(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sk_buff *skb;
+
+	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
+		kfree_skb(skb);
+
+	inet->cork.flags &= ~IPCORK_OPT;
+	if (inet->cork.opt) {
+		kfree(inet->cork.opt);
+		inet->cork.opt = NULL;
+	}
+	if (inet->cork.rt) {
+		ip_rt_put(inet->cork.rt);
+		inet->cork.rt = NULL;
+	}
+}
+
+
+/*
+ *	Fetch data from kernel space and fill in checksum if needed.
+ */
+static int ip_reply_glue_bits(void *dptr, char *to, int offset, 
+			      int len, int odd, struct sk_buff *skb)
+{
+	unsigned int csum;
+
+	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
+	skb->csum = csum_block_add(skb->csum, csum, odd);
+	return 0;  
+}
+
+/* 
+ *	Generic function to send a packet as reply to another packet.
+ *	Used to send TCP resets so far. ICMP should use this function too.
+ *
+ *	Should run single threaded per socket because it uses the sock 
+ *     	structure to pass arguments.
+ *
+ *	LATER: switch from ip_build_xmit to ip_append_*
+ */
+void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
+		   unsigned int len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct {
+		struct ip_options	opt;
+		char			data[40];
+	} replyopts;
+	struct ipcm_cookie ipc;
+	u32 daddr;
+	struct rtable *rt = (struct rtable*)skb->dst;
+
+	if (ip_options_echo(&replyopts.opt, skb))
+		return;
+
+	daddr = ipc.addr = rt->rt_src;
+	ipc.opt = NULL;
+
+	if (replyopts.opt.optlen) {
+		ipc.opt = &replyopts.opt;
+
+		if (ipc.opt->srr)
+			daddr = replyopts.opt.faddr;
+	}
+
+	{
+		struct flowi fl = { .nl_u = { .ip4_u =
+					      { .daddr = daddr,
+						.saddr = rt->rt_spec_dst,
+						.tos = RT_TOS(skb->nh.iph->tos) } },
+				    /* Not quite clean, but right. */
+				    .uli_u = { .ports =
+					       { .sport = skb->h.th->dest,
+					         .dport = skb->h.th->source } },
+				    .proto = sk->sk_protocol };
+		if (ip_route_output_key(&rt, &fl))
+			return;
+	}
+
+	/* And let IP do all the hard work.
+
+	   This chunk is not reenterable, hence spinlock.
+	   Note that it uses the fact, that this function is called
+	   with locally disabled BH and that sk cannot be already spinlocked.
+	 */
+	bh_lock_sock(sk);
+	inet->tos = skb->nh.iph->tos;
+	sk->sk_priority = skb->priority;
+	sk->sk_protocol = skb->nh.iph->protocol;
+	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
+		       &ipc, rt, MSG_DONTWAIT);
+	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
+		if (arg->csumoffset >= 0)
+			*((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum));
+		skb->ip_summed = CHECKSUM_NONE;
+		ip_push_pending_frames(sk);
+	}
+
+	bh_unlock_sock(sk);
+
+	ip_rt_put(rt);
+}
+
+/*
+ *	IP protocol layer initialiser
+ */
+
+static struct packet_type ip_packet_type = {
+	.type = __constant_htons(ETH_P_IP),
+	.func = ip_rcv,
+};
+
+/*
+ *	IP registers the packet type and then calls the subprotocol initialisers
+ */
+
+void __init ip_init(void)
+{
+	dev_add_pack(&ip_packet_type);
+
+	ip_rt_init();
+	inet_initpeers();
+
+#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
+	igmp_mc_proc_init();
+#endif
+}
+
+EXPORT_SYMBOL(ip_finish_output);
+EXPORT_SYMBOL(ip_fragment);
+EXPORT_SYMBOL(ip_generic_getfrag);
+EXPORT_SYMBOL(ip_queue_xmit);
+EXPORT_SYMBOL(ip_send_check);
+
+#ifdef CONFIG_SYSCTL
+EXPORT_SYMBOL(sysctl_ip_default_ttl);
+#endif
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
new file mode 100644
index 000000000000..47012b93cad2
--- /dev/null
+++ b/net/ipv4/ip_sockglue.c
@@ -0,0 +1,1093 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The IP to API glue.
+ *		
+ * Version:	$Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	see ip.c
+ *
+ * Fixes:
+ *		Many		:	Split from ip.c , see ip.c for history.
+ *		Martin Mares	:	TOS setting fixed.
+ *		Alan Cox	:	Fixed a couple of oopses in Martin's 
+ *					TOS tweaks.
+ *		Mike McLagan	:	Routing by source
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/netdevice.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/igmp.h>
+#include <linux/netfilter.h>
+#include <linux/route.h>
+#include <linux/mroute.h>
+#include <net/route.h>
+#include <net/xfrm.h>
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#include <net/transp_v6.h>
+#endif
+
+#include <linux/errqueue.h>
+#include <asm/uaccess.h>
+
+#define IP_CMSG_PKTINFO		1
+#define IP_CMSG_TTL		2
+#define IP_CMSG_TOS		4
+#define IP_CMSG_RECVOPTS	8
+#define IP_CMSG_RETOPTS		16
+
+/*
+ *	SOL_IP control messages.
+ */
+
+static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct in_pktinfo info;
+	struct rtable *rt = (struct rtable *)skb->dst;
+
+	info.ipi_addr.s_addr = skb->nh.iph->daddr;
+	if (rt) {
+		info.ipi_ifindex = rt->rt_iif;
+		info.ipi_spec_dst.s_addr = rt->rt_spec_dst;
+	} else {
+		info.ipi_ifindex = 0;
+		info.ipi_spec_dst.s_addr = 0;
+	}
+
+	put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+}
+
+static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb)
+{
+	int ttl = skb->nh.iph->ttl;
+	put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl);
+}
+
+static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb)
+{
+	put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos);
+}
+
+static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb)
+{
+	if (IPCB(skb)->opt.optlen == 0)
+		return;
+
+	put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1);
+}
+
+
+static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb)
+{
+	unsigned char optbuf[sizeof(struct ip_options) + 40];
+	struct ip_options * opt = (struct ip_options*)optbuf;
+
+	if (IPCB(skb)->opt.optlen == 0)
+		return;
+
+	if (ip_options_echo(opt, skb)) {
+		msg->msg_flags |= MSG_CTRUNC;
+		return;
+	}
+	ip_options_undo(opt);
+
+	put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data);
+}
+
+
+void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb)
+{
+	struct inet_sock *inet = inet_sk(skb->sk);
+	unsigned flags = inet->cmsg_flags;
+
+	/* Ordered by supposed usage frequency */
+	if (flags & 1)
+		ip_cmsg_recv_pktinfo(msg, skb);
+	if ((flags>>=1) == 0)
+		return;
+
+	if (flags & 1)
+		ip_cmsg_recv_ttl(msg, skb);
+	if ((flags>>=1) == 0)
+		return;
+
+	if (flags & 1)
+		ip_cmsg_recv_tos(msg, skb);
+	if ((flags>>=1) == 0)
+		return;
+
+	if (flags & 1)
+		ip_cmsg_recv_opts(msg, skb);
+	if ((flags>>=1) == 0)
+		return;
+
+	if (flags & 1)
+		ip_cmsg_recv_retopts(msg, skb);
+}
+
+int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc)
+{
+	int err;
+	struct cmsghdr *cmsg;
+
+	for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
+		if (!CMSG_OK(msg, cmsg))
+			return -EINVAL;
+		if (cmsg->cmsg_level != SOL_IP)
+			continue;
+		switch (cmsg->cmsg_type) {
+		case IP_RETOPTS:
+			err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+			err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0);
+			if (err)
+				return err;
+			break;
+		case IP_PKTINFO:
+		{
+			struct in_pktinfo *info;
+			if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo)))
+				return -EINVAL;
+			info = (struct in_pktinfo *)CMSG_DATA(cmsg);
+			ipc->oif = info->ipi_ifindex;
+			ipc->addr = info->ipi_spec_dst.s_addr;
+			break;
+		}
+		default:
+			return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+
+/* Special input handler for packets caught by router alert option.
+   They are selected only by protocol field, and then processed likely
+   local ones; but only if someone wants them! Otherwise, router
+   not running rsvpd will kill RSVP.
+
+   It is user level problem, what it will make with them.
+   I have no idea, how it will masquearde or NAT them (it is joke, joke :-)),
+   but receiver should be enough clever f.e. to forward mtrace requests,
+   sent to multicast group to reach destination designated router.
+ */
+struct ip_ra_chain *ip_ra_chain;
+DEFINE_RWLOCK(ip_ra_lock);
+
+int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *))
+{
+	struct ip_ra_chain *ra, *new_ra, **rap;
+
+	if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
+		return -EINVAL;
+
+	new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
+
+	write_lock_bh(&ip_ra_lock);
+	for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) {
+		if (ra->sk == sk) {
+			if (on) {
+				write_unlock_bh(&ip_ra_lock);
+				if (new_ra)
+					kfree(new_ra);
+				return -EADDRINUSE;
+			}
+			*rap = ra->next;
+			write_unlock_bh(&ip_ra_lock);
+
+			if (ra->destructor)
+				ra->destructor(sk);
+			sock_put(sk);
+			kfree(ra);
+			return 0;
+		}
+	}
+	if (new_ra == NULL) {
+		write_unlock_bh(&ip_ra_lock);
+		return -ENOBUFS;
+	}
+	new_ra->sk = sk;
+	new_ra->destructor = destructor;
+
+	new_ra->next = ra;
+	*rap = new_ra;
+	sock_hold(sk);
+	write_unlock_bh(&ip_ra_lock);
+
+	return 0;
+}
+
+void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, 
+		   u16 port, u32 info, u8 *payload)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sock_exterr_skb *serr;
+
+	if (!inet->recverr)
+		return;
+
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	serr = SKB_EXT_ERR(skb);  
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_ICMP;
+	serr->ee.ee_type = skb->h.icmph->type; 
+	serr->ee.ee_code = skb->h.icmph->code;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_info = info;
+	serr->ee.ee_data = 0;
+	serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw;
+	serr->port = port;
+
+	skb->h.raw = payload;
+	if (!skb_pull(skb, payload - skb->data) ||
+	    sock_queue_err_skb(sk, skb))
+		kfree_skb(skb);
+}
+
+void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sock_exterr_skb *serr;
+	struct iphdr *iph;
+	struct sk_buff *skb;
+
+	if (!inet->recverr)
+		return;
+
+	skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC);
+	if (!skb)
+		return;
+
+	iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr));
+	skb->nh.iph = iph;
+	iph->daddr = daddr;
+
+	serr = SKB_EXT_ERR(skb);  
+	serr->ee.ee_errno = err;
+	serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL;
+	serr->ee.ee_type = 0; 
+	serr->ee.ee_code = 0;
+	serr->ee.ee_pad = 0;
+	serr->ee.ee_info = info;
+	serr->ee.ee_data = 0;
+	serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw;
+	serr->port = port;
+
+	skb->h.raw = skb->tail;
+	__skb_pull(skb, skb->tail - skb->data);
+
+	if (sock_queue_err_skb(sk, skb))
+		kfree_skb(skb);
+}
+
+/* 
+ *	Handle MSG_ERRQUEUE
+ */
+int ip_recv_error(struct sock *sk, struct msghdr *msg, int len)
+{
+	struct sock_exterr_skb *serr;
+	struct sk_buff *skb, *skb2;
+	struct sockaddr_in *sin;
+	struct {
+		struct sock_extended_err ee;
+		struct sockaddr_in	 offender;
+	} errhdr;
+	int err;
+	int copied;
+
+	err = -EAGAIN;
+	skb = skb_dequeue(&sk->sk_error_queue);
+	if (skb == NULL)
+		goto out;
+
+	copied = skb->len;
+	if (copied > len) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto out_free_skb;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	serr = SKB_EXT_ERR(skb);
+
+	sin = (struct sockaddr_in *)msg->msg_name;
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset);
+		sin->sin_port = serr->port;
+		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+
+	memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err));
+	sin = &errhdr.offender;
+	sin->sin_family = AF_UNSPEC;
+	if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = skb->nh.iph->saddr;
+		sin->sin_port = 0;
+		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+		if (inet->cmsg_flags)
+			ip_cmsg_recv(msg, skb);
+	}
+
+	put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr);
+
+	/* Now we could try to dump offended packet options */
+
+	msg->msg_flags |= MSG_ERRQUEUE;
+	err = copied;
+
+	/* Reset and regenerate socket error */
+	spin_lock_irq(&sk->sk_error_queue.lock);
+	sk->sk_err = 0;
+	if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) {
+		sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno;
+		spin_unlock_irq(&sk->sk_error_queue.lock);
+		sk->sk_error_report(sk);
+	} else
+		spin_unlock_irq(&sk->sk_error_queue.lock);
+
+out_free_skb:	
+	kfree_skb(skb);
+out:
+	return err;
+}
+
+
+/*
+ *	Socket option code for IP. This is the end of the line after any TCP,UDP etc options on
+ *	an IP socket.
+ */
+
+int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int val=0,err;
+
+	if (level != SOL_IP)
+		return -ENOPROTOOPT;
+
+	if (((1<<optname) & ((1<<IP_PKTINFO) | (1<<IP_RECVTTL) | 
+			    (1<<IP_RECVOPTS) | (1<<IP_RECVTOS) | 
+			    (1<<IP_RETOPTS) | (1<<IP_TOS) | 
+			    (1<<IP_TTL) | (1<<IP_HDRINCL) | 
+			    (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | 
+			    (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND))) || 
+				optname == IP_MULTICAST_TTL || 
+				optname == IP_MULTICAST_LOOP) { 
+		if (optlen >= sizeof(int)) {
+			if (get_user(val, (int __user *) optval))
+				return -EFAULT;
+		} else if (optlen >= sizeof(char)) {
+			unsigned char ucval;
+
+			if (get_user(ucval, (unsigned char __user *) optval))
+				return -EFAULT;
+			val = (int) ucval;
+		}
+	}
+
+	/* If optlen==0, it is equivalent to val == 0 */
+
+#ifdef CONFIG_IP_MROUTE
+	if (optname >= MRT_BASE && optname <= (MRT_BASE + 10))
+		return ip_mroute_setsockopt(sk,optname,optval,optlen);
+#endif
+
+	err = 0;
+	lock_sock(sk);
+
+	switch (optname) {
+		case IP_OPTIONS:
+		{
+			struct ip_options * opt = NULL;
+			if (optlen > 40 || optlen < 0)
+				goto e_inval;
+			err = ip_options_get(&opt, optval, optlen, 1);
+			if (err)
+				break;
+			if (sk->sk_type == SOCK_STREAM) {
+				struct tcp_sock *tp = tcp_sk(sk);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+				if (sk->sk_family == PF_INET ||
+				    (!((1 << sk->sk_state) &
+				       (TCPF_LISTEN | TCPF_CLOSE)) &&
+				     inet->daddr != LOOPBACK4_IPV6)) {
+#endif
+					if (inet->opt)
+						tp->ext_header_len -= inet->opt->optlen;
+					if (opt)
+						tp->ext_header_len += opt->optlen;
+					tcp_sync_mss(sk, tp->pmtu_cookie);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+				}
+#endif
+			}
+			opt = xchg(&inet->opt, opt);
+			if (opt)
+				kfree(opt);
+			break;
+		}
+		case IP_PKTINFO:
+			if (val)
+				inet->cmsg_flags |= IP_CMSG_PKTINFO;
+			else
+				inet->cmsg_flags &= ~IP_CMSG_PKTINFO;
+			break;
+		case IP_RECVTTL:
+			if (val)
+				inet->cmsg_flags |=  IP_CMSG_TTL;
+			else
+				inet->cmsg_flags &= ~IP_CMSG_TTL;
+			break;
+		case IP_RECVTOS:
+			if (val)
+				inet->cmsg_flags |=  IP_CMSG_TOS;
+			else
+				inet->cmsg_flags &= ~IP_CMSG_TOS;
+			break;
+		case IP_RECVOPTS:
+			if (val)
+				inet->cmsg_flags |=  IP_CMSG_RECVOPTS;
+			else
+				inet->cmsg_flags &= ~IP_CMSG_RECVOPTS;
+			break;
+		case IP_RETOPTS:
+			if (val)
+				inet->cmsg_flags |= IP_CMSG_RETOPTS;
+			else
+				inet->cmsg_flags &= ~IP_CMSG_RETOPTS;
+			break;
+		case IP_TOS:	/* This sets both TOS and Precedence */
+			if (sk->sk_type == SOCK_STREAM) {
+				val &= ~3;
+				val |= inet->tos & 3;
+			}
+			if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && 
+			    !capable(CAP_NET_ADMIN)) {
+				err = -EPERM;
+				break;
+			}
+			if (inet->tos != val) {
+				inet->tos = val;
+				sk->sk_priority = rt_tos2priority(val);
+				sk_dst_reset(sk); 
+			}
+			break;
+		case IP_TTL:
+			if (optlen<1)
+				goto e_inval;
+			if (val != -1 && (val < 1 || val>255))
+				goto e_inval;
+			inet->uc_ttl = val;
+			break;
+		case IP_HDRINCL:
+			if (sk->sk_type != SOCK_RAW) {
+				err = -ENOPROTOOPT;
+				break;
+			}
+			inet->hdrincl = val ? 1 : 0;
+			break;
+		case IP_MTU_DISCOVER:
+			if (val<0 || val>2)
+				goto e_inval;
+			inet->pmtudisc = val;
+			break;
+		case IP_RECVERR:
+			inet->recverr = !!val;
+			if (!val)
+				skb_queue_purge(&sk->sk_error_queue);
+			break;
+		case IP_MULTICAST_TTL:
+			if (sk->sk_type == SOCK_STREAM)
+				goto e_inval;
+			if (optlen<1)
+				goto e_inval;
+			if (val==-1)
+				val = 1;
+			if (val < 0 || val > 255)
+				goto e_inval;
+			inet->mc_ttl = val;
+	                break;
+		case IP_MULTICAST_LOOP: 
+			if (optlen<1)
+				goto e_inval;
+			inet->mc_loop = !!val;
+	                break;
+		case IP_MULTICAST_IF: 
+		{
+			struct ip_mreqn mreq;
+			struct net_device *dev = NULL;
+
+			if (sk->sk_type == SOCK_STREAM)
+				goto e_inval;
+			/*
+			 *	Check the arguments are allowable
+			 */
+
+			err = -EFAULT;
+			if (optlen >= sizeof(struct ip_mreqn)) {
+				if (copy_from_user(&mreq,optval,sizeof(mreq)))
+					break;
+			} else {
+				memset(&mreq, 0, sizeof(mreq));
+				if (optlen >= sizeof(struct in_addr) &&
+				    copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr)))
+					break;
+			}
+
+			if (!mreq.imr_ifindex) {
+				if (mreq.imr_address.s_addr == INADDR_ANY) {
+					inet->mc_index = 0;
+					inet->mc_addr  = 0;
+					err = 0;
+					break;
+				}
+				dev = ip_dev_find(mreq.imr_address.s_addr);
+				if (dev) {
+					mreq.imr_ifindex = dev->ifindex;
+					dev_put(dev);
+				}
+			} else
+				dev = __dev_get_by_index(mreq.imr_ifindex);
+
+
+			err = -EADDRNOTAVAIL;
+			if (!dev)
+				break;
+
+			err = -EINVAL;
+			if (sk->sk_bound_dev_if &&
+			    mreq.imr_ifindex != sk->sk_bound_dev_if)
+				break;
+
+			inet->mc_index = mreq.imr_ifindex;
+			inet->mc_addr  = mreq.imr_address.s_addr;
+			err = 0;
+			break;
+		}
+
+		case IP_ADD_MEMBERSHIP:
+		case IP_DROP_MEMBERSHIP: 
+		{
+			struct ip_mreqn mreq;
+
+			if (optlen < sizeof(struct ip_mreq))
+				goto e_inval;
+			err = -EFAULT;
+			if (optlen >= sizeof(struct ip_mreqn)) {
+				if(copy_from_user(&mreq,optval,sizeof(mreq)))
+					break;
+			} else {
+				memset(&mreq, 0, sizeof(mreq));
+				if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq)))
+					break; 
+			}
+
+			if (optname == IP_ADD_MEMBERSHIP)
+				err = ip_mc_join_group(sk, &mreq);
+			else
+				err = ip_mc_leave_group(sk, &mreq);
+			break;
+		}
+		case IP_MSFILTER:
+		{
+			extern int sysctl_optmem_max;
+			extern int sysctl_igmp_max_msf;
+			struct ip_msfilter *msf;
+
+			if (optlen < IP_MSFILTER_SIZE(0))
+				goto e_inval;
+			if (optlen > sysctl_optmem_max) {
+				err = -ENOBUFS;
+				break;
+			}
+			msf = (struct ip_msfilter *)kmalloc(optlen, GFP_KERNEL);
+			if (msf == 0) {
+				err = -ENOBUFS;
+				break;
+			}
+			err = -EFAULT;
+			if (copy_from_user(msf, optval, optlen)) {
+				kfree(msf);
+				break;
+			}
+			/* numsrc >= (1G-4) overflow in 32 bits */
+			if (msf->imsf_numsrc >= 0x3ffffffcU ||
+			    msf->imsf_numsrc > sysctl_igmp_max_msf) {
+				kfree(msf);
+				err = -ENOBUFS;
+				break;
+			}
+			if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) {
+				kfree(msf);
+				err = -EINVAL;
+				break;
+			}
+			err = ip_mc_msfilter(sk, msf, 0);
+			kfree(msf);
+			break;
+		}
+		case IP_BLOCK_SOURCE:
+		case IP_UNBLOCK_SOURCE:
+		case IP_ADD_SOURCE_MEMBERSHIP:
+		case IP_DROP_SOURCE_MEMBERSHIP:
+		{
+			struct ip_mreq_source mreqs;
+			int omode, add;
+
+			if (optlen != sizeof(struct ip_mreq_source))
+				goto e_inval;
+			if (copy_from_user(&mreqs, optval, sizeof(mreqs))) {
+				err = -EFAULT;
+				break;
+			}
+			if (optname == IP_BLOCK_SOURCE) {
+				omode = MCAST_EXCLUDE;
+				add = 1;
+			} else if (optname == IP_UNBLOCK_SOURCE) {
+				omode = MCAST_EXCLUDE;
+				add = 0;
+			} else if (optname == IP_ADD_SOURCE_MEMBERSHIP) {
+				struct ip_mreqn mreq;
+
+				mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
+				mreq.imr_address.s_addr = mreqs.imr_interface;
+				mreq.imr_ifindex = 0;
+				err = ip_mc_join_group(sk, &mreq);
+				if (err)
+					break;
+				omode = MCAST_INCLUDE;
+				add = 1;
+			} else /*IP_DROP_SOURCE_MEMBERSHIP */ {
+				omode = MCAST_INCLUDE;
+				add = 0;
+			}
+			err = ip_mc_source(add, omode, sk, &mreqs, 0);
+			break;
+		}
+		case MCAST_JOIN_GROUP:
+		case MCAST_LEAVE_GROUP: 
+		{
+			struct group_req greq;
+			struct sockaddr_in *psin;
+			struct ip_mreqn mreq;
+
+			if (optlen < sizeof(struct group_req))
+				goto e_inval;
+			err = -EFAULT;
+			if(copy_from_user(&greq, optval, sizeof(greq)))
+				break;
+			psin = (struct sockaddr_in *)&greq.gr_group;
+			if (psin->sin_family != AF_INET)
+				goto e_inval;
+			memset(&mreq, 0, sizeof(mreq));
+			mreq.imr_multiaddr = psin->sin_addr;
+			mreq.imr_ifindex = greq.gr_interface;
+
+			if (optname == MCAST_JOIN_GROUP)
+				err = ip_mc_join_group(sk, &mreq);
+			else
+				err = ip_mc_leave_group(sk, &mreq);
+			break;
+		}
+		case MCAST_JOIN_SOURCE_GROUP:
+		case MCAST_LEAVE_SOURCE_GROUP:
+		case MCAST_BLOCK_SOURCE:
+		case MCAST_UNBLOCK_SOURCE:
+		{
+			struct group_source_req greqs;
+			struct ip_mreq_source mreqs;
+			struct sockaddr_in *psin;
+			int omode, add;
+
+			if (optlen != sizeof(struct group_source_req))
+				goto e_inval;
+			if (copy_from_user(&greqs, optval, sizeof(greqs))) {
+				err = -EFAULT;
+				break;
+			}
+			if (greqs.gsr_group.ss_family != AF_INET ||
+			    greqs.gsr_source.ss_family != AF_INET) {
+				err = -EADDRNOTAVAIL;
+				break;
+			}
+			psin = (struct sockaddr_in *)&greqs.gsr_group;
+			mreqs.imr_multiaddr = psin->sin_addr.s_addr;
+			psin = (struct sockaddr_in *)&greqs.gsr_source;
+			mreqs.imr_sourceaddr = psin->sin_addr.s_addr;
+			mreqs.imr_interface = 0; /* use index for mc_source */
+
+			if (optname == MCAST_BLOCK_SOURCE) {
+				omode = MCAST_EXCLUDE;
+				add = 1;
+			} else if (optname == MCAST_UNBLOCK_SOURCE) {
+				omode = MCAST_EXCLUDE;
+				add = 0;
+			} else if (optname == MCAST_JOIN_SOURCE_GROUP) {
+				struct ip_mreqn mreq;
+
+				psin = (struct sockaddr_in *)&greqs.gsr_group;
+				mreq.imr_multiaddr = psin->sin_addr;
+				mreq.imr_address.s_addr = 0;
+				mreq.imr_ifindex = greqs.gsr_interface;
+				err = ip_mc_join_group(sk, &mreq);
+				if (err)
+					break;
+				greqs.gsr_interface = mreq.imr_ifindex;
+				omode = MCAST_INCLUDE;
+				add = 1;
+			} else /* MCAST_LEAVE_SOURCE_GROUP */ {
+				omode = MCAST_INCLUDE;
+				add = 0;
+			}
+			err = ip_mc_source(add, omode, sk, &mreqs,
+				greqs.gsr_interface);
+			break;
+		}
+		case MCAST_MSFILTER:
+		{
+			extern int sysctl_optmem_max;
+			extern int sysctl_igmp_max_msf;
+			struct sockaddr_in *psin;
+			struct ip_msfilter *msf = NULL;
+			struct group_filter *gsf = NULL;
+			int msize, i, ifindex;
+
+			if (optlen < GROUP_FILTER_SIZE(0))
+				goto e_inval;
+			if (optlen > sysctl_optmem_max) {
+				err = -ENOBUFS;
+				break;
+			}
+			gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL);
+			if (gsf == 0) {
+				err = -ENOBUFS;
+				break;
+			}
+			err = -EFAULT;
+			if (copy_from_user(gsf, optval, optlen)) {
+				goto mc_msf_out;
+			}
+			/* numsrc >= (4G-140)/128 overflow in 32 bits */
+			if (gsf->gf_numsrc >= 0x1ffffff ||
+			    gsf->gf_numsrc > sysctl_igmp_max_msf) {
+				err = -ENOBUFS;
+				goto mc_msf_out;
+			}
+			if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) {
+				err = -EINVAL;
+				goto mc_msf_out;
+			}
+			msize = IP_MSFILTER_SIZE(gsf->gf_numsrc);
+			msf = (struct ip_msfilter *)kmalloc(msize,GFP_KERNEL);
+			if (msf == 0) {
+				err = -ENOBUFS;
+				goto mc_msf_out;
+			}
+			ifindex = gsf->gf_interface;
+			psin = (struct sockaddr_in *)&gsf->gf_group;
+			if (psin->sin_family != AF_INET) {
+				err = -EADDRNOTAVAIL;
+				goto mc_msf_out;
+			}
+			msf->imsf_multiaddr = psin->sin_addr.s_addr;
+			msf->imsf_interface = 0;
+			msf->imsf_fmode = gsf->gf_fmode;
+			msf->imsf_numsrc = gsf->gf_numsrc;
+			err = -EADDRNOTAVAIL;
+			for (i=0; i<gsf->gf_numsrc; ++i) {
+				psin = (struct sockaddr_in *)&gsf->gf_slist[i];
+
+				if (psin->sin_family != AF_INET)
+					goto mc_msf_out;
+				msf->imsf_slist[i] = psin->sin_addr.s_addr;
+			}
+			kfree(gsf);
+			gsf = NULL;
+
+			err = ip_mc_msfilter(sk, msf, ifindex);
+mc_msf_out:
+			if (msf)
+				kfree(msf);
+			if (gsf)
+				kfree(gsf);
+			break;
+		}
+		case IP_ROUTER_ALERT:	
+			err = ip_ra_control(sk, val ? 1 : 0, NULL);
+			break;
+
+		case IP_FREEBIND:
+			if (optlen<1)
+				goto e_inval;
+			inet->freebind = !!val; 
+	                break;			
+ 
+		case IP_IPSEC_POLICY:
+		case IP_XFRM_POLICY:
+			err = xfrm_user_policy(sk, optname, optval, optlen);
+			break;
+
+		default:
+#ifdef CONFIG_NETFILTER
+			err = nf_setsockopt(sk, PF_INET, optname, optval, 
+					    optlen);
+#else
+			err = -ENOPROTOOPT;
+#endif
+			break;
+	}
+	release_sock(sk);
+	return err;
+
+e_inval:
+	release_sock(sk);
+	return -EINVAL;
+}
+
+/*
+ *	Get the options. Note for future reference. The GET of IP options gets the
+ *	_received_ ones. The set sets the _sent_ ones.
+ */
+
+int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int val;
+	int len;
+	
+	if(level!=SOL_IP)
+		return -EOPNOTSUPP;
+
+#ifdef CONFIG_IP_MROUTE
+	if(optname>=MRT_BASE && optname <=MRT_BASE+10)
+	{
+		return ip_mroute_getsockopt(sk,optname,optval,optlen);
+	}
+#endif
+
+	if(get_user(len,optlen))
+		return -EFAULT;
+	if(len < 0)
+		return -EINVAL;
+		
+	lock_sock(sk);
+
+	switch(optname)	{
+		case IP_OPTIONS:
+			{
+				unsigned char optbuf[sizeof(struct ip_options)+40];
+				struct ip_options * opt = (struct ip_options*)optbuf;
+				opt->optlen = 0;
+				if (inet->opt)
+					memcpy(optbuf, inet->opt,
+					       sizeof(struct ip_options)+
+					       inet->opt->optlen);
+				release_sock(sk);
+
+				if (opt->optlen == 0) 
+					return put_user(0, optlen);
+
+				ip_options_undo(opt);
+
+				len = min_t(unsigned int, len, opt->optlen);
+				if(put_user(len, optlen))
+					return -EFAULT;
+				if(copy_to_user(optval, opt->__data, len))
+					return -EFAULT;
+				return 0;
+			}
+		case IP_PKTINFO:
+			val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0;
+			break;
+		case IP_RECVTTL:
+			val = (inet->cmsg_flags & IP_CMSG_TTL) != 0;
+			break;
+		case IP_RECVTOS:
+			val = (inet->cmsg_flags & IP_CMSG_TOS) != 0;
+			break;
+		case IP_RECVOPTS:
+			val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0;
+			break;
+		case IP_RETOPTS:
+			val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0;
+			break;
+		case IP_TOS:
+			val = inet->tos;
+			break;
+		case IP_TTL:
+			val = (inet->uc_ttl == -1 ?
+			       sysctl_ip_default_ttl :
+			       inet->uc_ttl);
+			break;
+		case IP_HDRINCL:
+			val = inet->hdrincl;
+			break;
+		case IP_MTU_DISCOVER:
+			val = inet->pmtudisc;
+			break;
+		case IP_MTU:
+		{
+			struct dst_entry *dst;
+			val = 0;
+			dst = sk_dst_get(sk);
+			if (dst) {
+				val = dst_mtu(dst);
+				dst_release(dst);
+			}
+			if (!val) {
+				release_sock(sk);
+				return -ENOTCONN;
+			}
+			break;
+		}
+		case IP_RECVERR:
+			val = inet->recverr;
+			break;
+		case IP_MULTICAST_TTL:
+			val = inet->mc_ttl;
+			break;
+		case IP_MULTICAST_LOOP:
+			val = inet->mc_loop;
+			break;
+		case IP_MULTICAST_IF:
+		{
+			struct in_addr addr;
+			len = min_t(unsigned int, len, sizeof(struct in_addr));
+			addr.s_addr = inet->mc_addr;
+			release_sock(sk);
+
+  			if(put_user(len, optlen))
+  				return -EFAULT;
+			if(copy_to_user(optval, &addr, len))
+				return -EFAULT;
+			return 0;
+		}
+		case IP_MSFILTER:
+		{
+			struct ip_msfilter msf;
+			int err;
+
+			if (len < IP_MSFILTER_SIZE(0)) {
+				release_sock(sk);
+				return -EINVAL;
+			}
+			if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
+				release_sock(sk);
+				return -EFAULT;
+			}
+			err = ip_mc_msfget(sk, &msf,
+				(struct ip_msfilter __user *)optval, optlen);
+			release_sock(sk);
+			return err;
+		}
+		case MCAST_MSFILTER:
+		{
+			struct group_filter gsf;
+			int err;
+
+			if (len < GROUP_FILTER_SIZE(0)) {
+				release_sock(sk);
+				return -EINVAL;
+			}
+			if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
+				release_sock(sk);
+				return -EFAULT;
+			}
+			err = ip_mc_gsfget(sk, &gsf,
+				(struct group_filter __user *)optval, optlen);
+			release_sock(sk);
+			return err;
+		}
+		case IP_PKTOPTIONS:		
+		{
+			struct msghdr msg;
+
+			release_sock(sk);
+
+			if (sk->sk_type != SOCK_STREAM)
+				return -ENOPROTOOPT;
+
+			msg.msg_control = optval;
+			msg.msg_controllen = len;
+			msg.msg_flags = 0;
+
+			if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
+				struct in_pktinfo info;
+
+				info.ipi_addr.s_addr = inet->rcv_saddr;
+				info.ipi_spec_dst.s_addr = inet->rcv_saddr;
+				info.ipi_ifindex = inet->mc_index;
+				put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
+			}
+			if (inet->cmsg_flags & IP_CMSG_TTL) {
+				int hlim = inet->mc_ttl;
+				put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim);
+			}
+			len -= msg.msg_controllen;
+			return put_user(len, optlen);
+		}
+		case IP_FREEBIND: 
+			val = inet->freebind; 
+			break; 
+		default:
+#ifdef CONFIG_NETFILTER
+			val = nf_getsockopt(sk, PF_INET, optname, optval, 
+					    &len);
+			release_sock(sk);
+			if (val >= 0)
+				val = put_user(len, optlen);
+			return val;
+#else
+			release_sock(sk);
+			return -ENOPROTOOPT;
+#endif
+	}
+	release_sock(sk);
+	
+	if (len < sizeof(int) && len > 0 && val>=0 && val<255) {
+		unsigned char ucval = (unsigned char)val;
+		len = 1;
+		if(put_user(len, optlen))
+			return -EFAULT;
+		if(copy_to_user(optval,&ucval,1))
+			return -EFAULT;
+	} else {
+		len = min_t(unsigned int, sizeof(int), len);
+		if(put_user(len, optlen))
+			return -EFAULT;
+		if(copy_to_user(optval,&val,len))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(ip_cmsg_recv);
+
+#ifdef CONFIG_IP_SCTP_MODULE
+EXPORT_SYMBOL(ip_getsockopt);
+EXPORT_SYMBOL(ip_setsockopt);
+#endif
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
new file mode 100644
index 000000000000..1a23c5263b99
--- /dev/null
+++ b/net/ipv4/ipcomp.c
@@ -0,0 +1,524 @@
+/*
+ * IP Payload Compression Protocol (IPComp) - RFC3173.
+ *
+ * Copyright (c) 2003 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option) 
+ * any later version.
+ *
+ * Todo:
+ *   - Tunable compression parameters.
+ *   - Compression stats.
+ *   - Adaptive compression.
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/scatterlist.h>
+#include <asm/semaphore.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/percpu.h>
+#include <linux/smp.h>
+#include <linux/list.h>
+#include <linux/vmalloc.h>
+#include <linux/rtnetlink.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+#include <net/ipcomp.h>
+
+struct ipcomp_tfms {
+	struct list_head list;
+	struct crypto_tfm **tfms;
+	int users;
+};
+
+static DECLARE_MUTEX(ipcomp_resource_sem);
+static void **ipcomp_scratches;
+static int ipcomp_scratch_users;
+static LIST_HEAD(ipcomp_tfms_list);
+
+static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err, plen, dlen;
+	struct iphdr *iph;
+	struct ipcomp_data *ipcd = x->data;
+	u8 *start, *scratch;
+	struct crypto_tfm *tfm;
+	int cpu;
+	
+	plen = skb->len;
+	dlen = IPCOMP_SCRATCH_SIZE;
+	start = skb->data;
+
+	cpu = get_cpu();
+	scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+	tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+
+	err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen);
+	if (err)
+		goto out;
+
+	if (dlen < (plen + sizeof(struct ip_comp_hdr))) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC);
+	if (err)
+		goto out;
+		
+	skb_put(skb, dlen - plen);
+	memcpy(skb->data, scratch, dlen);
+	iph = skb->nh.iph;
+	iph->tot_len = htons(dlen + iph->ihl * 4);
+out:	
+	put_cpu();
+	return err;
+}
+
+static int ipcomp_input(struct xfrm_state *x,
+                        struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+	u8 nexthdr;
+	int err = 0;
+	struct iphdr *iph;
+	union {
+		struct iphdr	iph;
+		char 		buf[60];
+	} tmp_iph;
+
+
+	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+	    skb_linearize(skb, GFP_ATOMIC) != 0) {
+	    	err = -ENOMEM;
+	    	goto out;
+	}
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	/* Remove ipcomp header and decompress original payload */	
+	iph = skb->nh.iph;
+	memcpy(&tmp_iph, iph, iph->ihl * 4);
+	nexthdr = *(u8 *)skb->data;
+	skb_pull(skb, sizeof(struct ip_comp_hdr));
+	skb->nh.raw += sizeof(struct ip_comp_hdr);
+	memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4);
+	iph = skb->nh.iph;
+	iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr));
+	iph->protocol = nexthdr;
+	skb->h.raw = skb->data;
+	err = ipcomp_decompress(x, skb);
+
+out:	
+	return err;
+}
+
+static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err, plen, dlen, ihlen;
+	struct iphdr *iph = skb->nh.iph;
+	struct ipcomp_data *ipcd = x->data;
+	u8 *start, *scratch;
+	struct crypto_tfm *tfm;
+	int cpu;
+	
+	ihlen = iph->ihl * 4;
+	plen = skb->len - ihlen;
+	dlen = IPCOMP_SCRATCH_SIZE;
+	start = skb->data + ihlen;
+
+	cpu = get_cpu();
+	scratch = *per_cpu_ptr(ipcomp_scratches, cpu);
+	tfm = *per_cpu_ptr(ipcd->tfms, cpu);
+
+	err = crypto_comp_compress(tfm, start, plen, scratch, &dlen);
+	if (err)
+		goto out;
+
+	if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) {
+		err = -EMSGSIZE;
+		goto out;
+	}
+	
+	memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen);
+	put_cpu();
+
+	pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr));
+	return 0;
+	
+out:	
+	put_cpu();
+	return err;
+}
+
+static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err;
+	struct iphdr *iph;
+	struct ip_comp_hdr *ipch;
+	struct ipcomp_data *ipcd = x->data;
+	int hdr_len = 0;
+
+	iph = skb->nh.iph;
+	iph->tot_len = htons(skb->len);
+	hdr_len = iph->ihl * 4;
+	if ((skb->len - hdr_len) < ipcd->threshold) {
+		/* Don't bother compressing */
+		goto out_ok;
+	}
+
+	if ((skb_is_nonlinear(skb) || skb_cloned(skb)) &&
+	    skb_linearize(skb, GFP_ATOMIC) != 0) {
+		goto out_ok;
+	}
+	
+	err = ipcomp_compress(x, skb);
+	iph = skb->nh.iph;
+
+	if (err) {
+		goto out_ok;
+	}
+
+	/* Install ipcomp header, convert into ipcomp datagram. */
+	iph->tot_len = htons(skb->len);
+	ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4);
+	ipch->nexthdr = iph->protocol;
+	ipch->flags = 0;
+	ipch->cpi = htons((u16 )ntohl(x->id.spi));
+	iph->protocol = IPPROTO_COMP;
+	ip_send_check(iph);
+	return 0;
+
+out_ok:
+	if (x->props.mode)
+		ip_send_check(iph);
+	return 0;
+}
+
+static void ipcomp4_err(struct sk_buff *skb, u32 info)
+{
+	u32 spi;
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+	struct xfrm_state *x;
+
+	if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+	    skb->h.icmph->code != ICMP_FRAG_NEEDED)
+		return;
+
+	spi = ntohl(ntohs(ipch->cpi));
+	x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr,
+	                      spi, IPPROTO_COMP, AF_INET);
+	if (!x)
+		return;
+	NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n",
+	       spi, NIPQUAD(iph->daddr)));
+	xfrm_state_put(x);
+}
+
+/* We always hold one tunnel user reference to indicate a tunnel */ 
+static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
+{
+	struct xfrm_state *t;
+	
+	t = xfrm_state_alloc();
+	if (t == NULL)
+		goto out;
+
+	t->id.proto = IPPROTO_IPIP;
+	t->id.spi = x->props.saddr.a4;
+	t->id.daddr.a4 = x->id.daddr.a4;
+	memcpy(&t->sel, &x->sel, sizeof(t->sel));
+	t->props.family = AF_INET;
+	t->props.mode = 1;
+	t->props.saddr.a4 = x->props.saddr.a4;
+	t->props.flags = x->props.flags;
+	
+	t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family);
+	if (t->type == NULL)
+		goto error;
+		
+	if (t->type->init_state(t, NULL))
+		goto error;
+
+	t->km.state = XFRM_STATE_VALID;
+	atomic_set(&t->tunnel_users, 1);
+out:
+	return t;
+
+error:
+	t->km.state = XFRM_STATE_DEAD;
+	xfrm_state_put(t);
+	t = NULL;
+	goto out;
+}
+
+/*
+ * Must be protected by xfrm_cfg_sem.  State and tunnel user references are
+ * always incremented on success.
+ */
+static int ipcomp_tunnel_attach(struct xfrm_state *x)
+{
+	int err = 0;
+	struct xfrm_state *t;
+
+	t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4,
+	                      x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
+	if (!t) {
+		t = ipcomp_tunnel_create(x);
+		if (!t) {
+			err = -EINVAL;
+			goto out;
+		}
+		xfrm_state_insert(t);
+		xfrm_state_hold(t);
+	}
+	x->tunnel = t;
+	atomic_inc(&t->tunnel_users);
+out:
+	return err;
+}
+
+static void ipcomp_free_scratches(void)
+{
+	int i;
+	void **scratches;
+
+	if (--ipcomp_scratch_users)
+		return;
+
+	scratches = ipcomp_scratches;
+	if (!scratches)
+		return;
+
+	for_each_cpu(i) {
+		void *scratch = *per_cpu_ptr(scratches, i);
+		if (scratch)
+			vfree(scratch);
+	}
+
+	free_percpu(scratches);
+}
+
+static void **ipcomp_alloc_scratches(void)
+{
+	int i;
+	void **scratches;
+
+	if (ipcomp_scratch_users++)
+		return ipcomp_scratches;
+
+	scratches = alloc_percpu(void *);
+	if (!scratches)
+		return NULL;
+
+	ipcomp_scratches = scratches;
+
+	for_each_cpu(i) {
+		void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE);
+		if (!scratch)
+			return NULL;
+		*per_cpu_ptr(scratches, i) = scratch;
+	}
+
+	return scratches;
+}
+
+static void ipcomp_free_tfms(struct crypto_tfm **tfms)
+{
+	struct ipcomp_tfms *pos;
+	int cpu;
+
+	list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+		if (pos->tfms == tfms)
+			break;
+	}
+
+	BUG_TRAP(pos);
+
+	if (--pos->users)
+		return;
+
+	list_del(&pos->list);
+	kfree(pos);
+
+	if (!tfms)
+		return;
+
+	for_each_cpu(cpu) {
+		struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu);
+		if (tfm)
+			crypto_free_tfm(tfm);
+	}
+	free_percpu(tfms);
+}
+
+static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name)
+{
+	struct ipcomp_tfms *pos;
+	struct crypto_tfm **tfms;
+	int cpu;
+
+	/* This can be any valid CPU ID so we don't need locking. */
+	cpu = smp_processor_id();
+
+	list_for_each_entry(pos, &ipcomp_tfms_list, list) {
+		struct crypto_tfm *tfm;
+
+		tfms = pos->tfms;
+		tfm = *per_cpu_ptr(tfms, cpu);
+
+		if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) {
+			pos->users++;
+			return tfms;
+		}
+	}
+
+	pos = kmalloc(sizeof(*pos), GFP_KERNEL);
+	if (!pos)
+		return NULL;
+
+	pos->users = 1;
+	INIT_LIST_HEAD(&pos->list);
+	list_add(&pos->list, &ipcomp_tfms_list);
+
+	pos->tfms = tfms = alloc_percpu(struct crypto_tfm *);
+	if (!tfms)
+		goto error;
+
+	for_each_cpu(cpu) {
+		struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0);
+		if (!tfm)
+			goto error;
+		*per_cpu_ptr(tfms, cpu) = tfm;
+	}
+
+	return tfms;
+
+error:
+	ipcomp_free_tfms(tfms);
+	return NULL;
+}
+
+static void ipcomp_free_data(struct ipcomp_data *ipcd)
+{
+	if (ipcd->tfms)
+		ipcomp_free_tfms(ipcd->tfms);
+	ipcomp_free_scratches();
+}
+
+static void ipcomp_destroy(struct xfrm_state *x)
+{
+	struct ipcomp_data *ipcd = x->data;
+	if (!ipcd)
+		return;
+	xfrm_state_delete_tunnel(x);
+	down(&ipcomp_resource_sem);
+	ipcomp_free_data(ipcd);
+	up(&ipcomp_resource_sem);
+	kfree(ipcd);
+}
+
+static int ipcomp_init_state(struct xfrm_state *x, void *args)
+{
+	int err;
+	struct ipcomp_data *ipcd;
+	struct xfrm_algo_desc *calg_desc;
+
+	err = -EINVAL;
+	if (!x->calg)
+		goto out;
+
+	if (x->encap)
+		goto out;
+
+	err = -ENOMEM;
+	ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL);
+	if (!ipcd)
+		goto out;
+
+	memset(ipcd, 0, sizeof(*ipcd));
+	x->props.header_len = 0;
+	if (x->props.mode)
+		x->props.header_len += sizeof(struct iphdr);
+
+	down(&ipcomp_resource_sem);
+	if (!ipcomp_alloc_scratches())
+		goto error;
+
+	ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name);
+	if (!ipcd->tfms)
+		goto error;
+	up(&ipcomp_resource_sem);
+
+	if (x->props.mode) {
+		err = ipcomp_tunnel_attach(x);
+		if (err)
+			goto error_tunnel;
+	}
+
+	calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0);
+	BUG_ON(!calg_desc);
+	ipcd->threshold = calg_desc->uinfo.comp.threshold;
+	x->data = ipcd;
+	err = 0;
+out:
+	return err;
+
+error_tunnel:
+	down(&ipcomp_resource_sem);
+error:
+	ipcomp_free_data(ipcd);
+	up(&ipcomp_resource_sem);
+	kfree(ipcd);
+	goto out;
+}
+
+static struct xfrm_type ipcomp_type = {
+	.description	= "IPCOMP4",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_COMP,
+	.init_state	= ipcomp_init_state,
+	.destructor	= ipcomp_destroy,
+	.input		= ipcomp_input,
+	.output		= ipcomp_output
+};
+
+static struct net_protocol ipcomp4_protocol = {
+	.handler	=	xfrm4_rcv,
+	.err_handler	=	ipcomp4_err,
+	.no_policy	=	1,
+};
+
+static int __init ipcomp4_init(void)
+{
+	if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) {
+		printk(KERN_INFO "ipcomp init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+	if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) {
+		printk(KERN_INFO "ipcomp init: can't add protocol\n");
+		xfrm_unregister_type(&ipcomp_type, AF_INET);
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+static void __exit ipcomp4_fini(void)
+{
+	if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0)
+		printk(KERN_INFO "ip ipcomp close: can't remove protocol\n");
+	if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0)
+		printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n");
+}
+
+module_init(ipcomp4_init);
+module_exit(ipcomp4_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
new file mode 100644
index 000000000000..f2509034ce72
--- /dev/null
+++ b/net/ipv4/ipconfig.c
@@ -0,0 +1,1507 @@
+/*
+ *  $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $
+ *
+ *  Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or
+ *  user-supplied information to configure own IP address and routes.
+ *
+ *  Copyright (C) 1996-1998 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ *  Derived from network configuration code in fs/nfs/nfsroot.c,
+ *  originally Copyright (C) 1995, 1996 Gero Kuhlmann and me.
+ *
+ *  BOOTP rewritten to construct and analyse packets itself instead
+ *  of misusing the IP layer. num_bugs_causing_wrong_arp_replies--;
+ *					     -- MJ, December 1998
+ *  
+ *  Fixed ip_auto_config_setup calling at startup in the new "Linker Magic"
+ *  initialization scheme.
+ *	- Arnaldo Carvalho de Melo <acme@conectiva.com.br>, 08/11/1999
+ *
+ *  DHCP support added.  To users this looks like a whole separate
+ *  protocol, but we know it's just a bag on the side of BOOTP.
+ *		-- Chip Salzenberg <chip@valinux.com>, May 2000
+ *
+ *  Ported DHCP support from 2.2.16 to 2.4.0-test4
+ *              -- Eric Biederman <ebiederman@lnxi.com>, 30 Aug 2000
+ *
+ *  Merged changes from 2.2.19 into 2.4.3
+ *              -- Eric Biederman <ebiederman@lnxi.com>, 22 April Aug 2001
+ *
+ *  Multiple Nameservers in /proc/net/pnp
+ *              --  Josef Siemes <jsiemes@web.de>, Aug 2002
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <linux/jiffies.h>
+#include <linux/random.h>
+#include <linux/init.h>
+#include <linux/utsname.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/socket.h>
+#include <linux/route.h>
+#include <linux/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/major.h>
+#include <linux/root_dev.h>
+#include <linux/delay.h>
+#include <net/arp.h>
+#include <net/ip.h>
+#include <net/ipconfig.h>
+
+#include <asm/uaccess.h>
+#include <net/checksum.h>
+#include <asm/processor.h>
+
+/* Define this to allow debugging output */
+#undef IPCONFIG_DEBUG
+
+#ifdef IPCONFIG_DEBUG
+#define DBG(x) printk x
+#else
+#define DBG(x) do { } while(0)
+#endif
+
+#if defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_DHCP
+#endif
+#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP)
+#define IPCONFIG_BOOTP
+#endif
+#if defined(CONFIG_IP_PNP_RARP)
+#define IPCONFIG_RARP
+#endif
+#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP)
+#define IPCONFIG_DYNAMIC
+#endif
+
+/* Define the friendly delay before and after opening net devices */
+#define CONF_PRE_OPEN		500	/* Before opening: 1/2 second */
+#define CONF_POST_OPEN		1	/* After opening: 1 second */
+
+/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */
+#define CONF_OPEN_RETRIES 	2	/* (Re)open devices twice */
+#define CONF_SEND_RETRIES 	6	/* Send six requests per open */
+#define CONF_INTER_TIMEOUT	(HZ/2)	/* Inter-device timeout: 1/2 second */
+#define CONF_BASE_TIMEOUT	(HZ*2)	/* Initial timeout: 2 seconds */
+#define CONF_TIMEOUT_RANDOM	(HZ)	/* Maximum amount of randomization */
+#define CONF_TIMEOUT_MULT	*7/4	/* Rate of timeout growth */
+#define CONF_TIMEOUT_MAX	(HZ*30)	/* Maximum allowed timeout */
+#define CONF_NAMESERVERS_MAX   3       /* Maximum number of nameservers  
+                                           - '3' from resolv.h */
+
+
+/*
+ * Public IP configuration
+ */
+
+/* This is used by platforms which might be able to set the ipconfig
+ * variables using firmware environment vars.  If this is set, it will
+ * ignore such firmware variables.
+ */
+int ic_set_manually __initdata = 0;		/* IPconfig parameters set manually */
+
+static int ic_enable __initdata = 0;		/* IP config enabled? */
+
+/* Protocol choice */
+int ic_proto_enabled __initdata = 0
+#ifdef IPCONFIG_BOOTP
+			| IC_BOOTP
+#endif
+#ifdef CONFIG_IP_PNP_DHCP
+			| IC_USE_DHCP
+#endif
+#ifdef IPCONFIG_RARP
+			| IC_RARP
+#endif
+			;
+
+static int ic_host_name_set __initdata = 0;	/* Host name set by us? */
+
+u32 ic_myaddr = INADDR_NONE;		/* My IP address */
+static u32 ic_netmask = INADDR_NONE;	/* Netmask for local subnet */
+u32 ic_gateway = INADDR_NONE;	/* Gateway IP address */
+
+u32 ic_servaddr = INADDR_NONE;	/* Boot server IP address */
+
+u32 root_server_addr = INADDR_NONE;	/* Address of NFS server */
+u8 root_server_path[256] = { 0, };	/* Path to mount as root */
+
+/* Persistent data: */
+
+static int ic_proto_used;			/* Protocol used, if any */
+static u32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
+static u8 ic_domain[64];		/* DNS (not NIS) domain name */
+
+/*
+ * Private state.
+ */
+
+/* Name of user-selected boot device */
+static char user_dev_name[IFNAMSIZ] __initdata = { 0, };
+
+/* Protocols supported by available interfaces */
+static int ic_proto_have_if __initdata = 0;
+
+#ifdef IPCONFIG_DYNAMIC
+static DEFINE_SPINLOCK(ic_recv_lock);
+static volatile int ic_got_reply __initdata = 0;    /* Proto(s) that replied */
+#endif
+#ifdef IPCONFIG_DHCP
+static int ic_dhcp_msgtype __initdata = 0;	/* DHCP msg type received */
+#endif
+
+
+/*
+ *	Network devices
+ */
+
+struct ic_device {
+	struct ic_device *next;
+	struct net_device *dev;
+	unsigned short flags;
+	short able;
+	u32 xid;
+};
+
+static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
+static struct net_device *ic_dev __initdata = NULL;	/* Selected device */
+
+static int __init ic_open_devs(void)
+{
+	struct ic_device *d, **last;
+	struct net_device *dev;
+	unsigned short oflags;
+
+	last = &ic_first_dev;
+	rtnl_shlock();
+
+	/* bring loopback device up first */
+	if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0)
+		printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name);
+
+	for (dev = dev_base; dev; dev = dev->next) {
+		if (dev == &loopback_dev)
+			continue;
+		if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+		    (!(dev->flags & IFF_LOOPBACK) &&
+		     (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
+		     strncmp(dev->name, "dummy", 5))) {
+			int able = 0;
+			if (dev->mtu >= 364)
+				able |= IC_BOOTP;
+			else
+				printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu);
+			if (!(dev->flags & IFF_NOARP))
+				able |= IC_RARP;
+			able &= ic_proto_enabled;
+			if (ic_proto_enabled && !able)
+				continue;
+			oflags = dev->flags;
+			if (dev_change_flags(dev, oflags | IFF_UP) < 0) {
+				printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name);
+				continue;
+			}
+			if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
+				rtnl_shunlock();
+				return -1;
+			}
+			d->dev = dev;
+			*last = d;
+			last = &d->next;
+			d->flags = oflags;
+			d->able = able;
+			if (able & IC_BOOTP)
+				get_random_bytes(&d->xid, sizeof(u32));
+			else
+				d->xid = 0;
+			ic_proto_have_if |= able;
+			DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
+				dev->name, able, d->xid));
+		}
+	}
+	rtnl_shunlock();
+
+	*last = NULL;
+
+	if (!ic_first_dev) {
+		if (user_dev_name[0])
+			printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
+		else
+			printk(KERN_ERR "IP-Config: No network devices available.\n");
+		return -1;
+	}
+	return 0;
+}
+
+static void __init ic_close_devs(void)
+{
+	struct ic_device *d, *next;
+	struct net_device *dev;
+
+	rtnl_shlock();
+	next = ic_first_dev;
+	while ((d = next)) {
+		next = d->next;
+		dev = d->dev;
+		if (dev != ic_dev) {
+			DBG(("IP-Config: Downing %s\n", dev->name));
+			dev_change_flags(dev, d->flags);
+		}
+		kfree(d);
+	}
+	rtnl_shunlock();
+}
+
+/*
+ *	Interface to various network functions.
+ */
+
+static inline void
+set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port)
+{
+	sin->sin_family = AF_INET;
+	sin->sin_addr.s_addr = addr;
+	sin->sin_port = port;
+}
+
+static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg)
+{
+	int res;
+
+	mm_segment_t oldfs = get_fs();
+	set_fs(get_ds());
+	res = devinet_ioctl(cmd, (struct ifreq __user *) arg);
+	set_fs(oldfs);
+	return res;
+}
+
+static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg)
+{
+	int res;
+
+	mm_segment_t oldfs = get_fs();
+	set_fs(get_ds());
+	res = ip_rt_ioctl(cmd, (void __user *) arg);
+	set_fs(oldfs);
+	return res;
+}
+
+/*
+ *	Set up interface addresses and routes.
+ */
+
+static int __init ic_setup_if(void)
+{
+	struct ifreq ir;
+	struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr;
+	int err;
+
+	memset(&ir, 0, sizeof(ir));
+	strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name);
+	set_sockaddr(sin, ic_myaddr, 0);
+	if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) {
+		printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err);
+		return -1;
+	}
+	set_sockaddr(sin, ic_netmask, 0);
+	if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) {
+		printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err);
+		return -1;
+	}
+	set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0);
+	if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) {
+		printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err);
+		return -1;
+	}
+	return 0;
+}
+
+static int __init ic_setup_routes(void)
+{
+	/* No need to setup device routes, only the default route... */
+
+	if (ic_gateway != INADDR_NONE) {
+		struct rtentry rm;
+		int err;
+
+		memset(&rm, 0, sizeof(rm));
+		if ((ic_gateway ^ ic_myaddr) & ic_netmask) {
+			printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n");
+			return -1;
+		}
+		set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0);
+		set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0);
+		set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0);
+		rm.rt_flags = RTF_UP | RTF_GATEWAY;
+		if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) {
+			printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ *	Fill in default values for all missing parameters.
+ */
+
+static int __init ic_defaults(void)
+{
+	/*
+	 *	At this point we have no userspace running so need not
+	 *	claim locks on system_utsname
+	 */
+	 
+	if (!ic_host_name_set)
+		sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr));
+
+	if (root_server_addr == INADDR_NONE)
+		root_server_addr = ic_servaddr;
+
+	if (ic_netmask == INADDR_NONE) {
+		if (IN_CLASSA(ntohl(ic_myaddr)))
+			ic_netmask = htonl(IN_CLASSA_NET);
+		else if (IN_CLASSB(ntohl(ic_myaddr)))
+			ic_netmask = htonl(IN_CLASSB_NET);
+		else if (IN_CLASSC(ntohl(ic_myaddr)))
+			ic_netmask = htonl(IN_CLASSC_NET);
+		else {
+			printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n",
+				NIPQUAD(ic_myaddr));
+			return -1;
+		}
+		printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask));
+	}
+
+	return 0;
+}
+
+/*
+ *	RARP support.
+ */
+
+#ifdef IPCONFIG_RARP
+
+static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+
+static struct packet_type rarp_packet_type __initdata = {
+	.type =	__constant_htons(ETH_P_RARP),
+	.func =	ic_rarp_recv,
+};
+
+static inline void ic_rarp_init(void)
+{
+	dev_add_pack(&rarp_packet_type);
+}
+
+static inline void ic_rarp_cleanup(void)
+{
+	dev_remove_pack(&rarp_packet_type);
+}
+
+/*
+ *  Process received RARP packet.
+ */
+static int __init
+ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+	struct arphdr *rarp;
+	unsigned char *rarp_ptr;
+	unsigned long sip, tip;
+	unsigned char *sha, *tha;		/* s for "source", t for "target" */
+	struct ic_device *d;
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+		return NET_RX_DROP;
+
+	if (!pskb_may_pull(skb, sizeof(struct arphdr)))
+		goto drop;
+
+	/* Basic sanity checks can be done without the lock.  */
+	rarp = (struct arphdr *)skb->h.raw;
+
+	/* If this test doesn't pass, it's not IP, or we should
+	 * ignore it anyway.
+	 */
+	if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd))
+		goto drop;
+
+	/* If it's not a RARP reply, delete it. */
+	if (rarp->ar_op != htons(ARPOP_RREPLY))
+		goto drop;
+
+	/* If it's not Ethernet, delete it. */
+	if (rarp->ar_pro != htons(ETH_P_IP))
+		goto drop;
+
+	if (!pskb_may_pull(skb,
+			   sizeof(struct arphdr) +
+			   (2 * dev->addr_len) +
+			   (2 * 4)))
+		goto drop;
+
+	/* OK, it is all there and looks valid, process... */
+	rarp = (struct arphdr *)skb->h.raw;
+	rarp_ptr = (unsigned char *) (rarp + 1);
+
+	/* One reply at a time, please. */
+	spin_lock(&ic_recv_lock);
+
+	/* If we already have a reply, just drop the packet */
+	if (ic_got_reply)
+		goto drop_unlock;
+
+	/* Find the ic_device that the packet arrived on */
+	d = ic_first_dev;
+	while (d && d->dev != dev)
+		d = d->next;
+	if (!d)
+		goto drop_unlock;	/* should never happen */
+
+	/* Extract variable-width fields */
+	sha = rarp_ptr;
+	rarp_ptr += dev->addr_len;
+	memcpy(&sip, rarp_ptr, 4);
+	rarp_ptr += 4;
+	tha = rarp_ptr;
+	rarp_ptr += dev->addr_len;
+	memcpy(&tip, rarp_ptr, 4);
+
+	/* Discard packets which are not meant for us. */
+	if (memcmp(tha, dev->dev_addr, dev->addr_len))
+		goto drop_unlock;
+
+	/* Discard packets which are not from specified server. */
+	if (ic_servaddr != INADDR_NONE && ic_servaddr != sip)
+		goto drop_unlock;
+
+	/* We have a winner! */
+	ic_dev = dev;
+	if (ic_myaddr == INADDR_NONE)
+		ic_myaddr = tip;
+	ic_servaddr = sip;
+	ic_got_reply = IC_RARP;
+
+drop_unlock:
+	/* Show's over.  Nothing to see here.  */
+	spin_unlock(&ic_recv_lock);
+
+drop:
+	/* Throw the packet out. */
+	kfree_skb(skb);
+	return 0;
+}
+
+
+/*
+ *  Send RARP request packet over a single interface.
+ */
+static void __init ic_rarp_send_if(struct ic_device *d)
+{
+	struct net_device *dev = d->dev;
+	arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL,
+		 dev->dev_addr, dev->dev_addr);
+}
+#endif
+
+/*
+ *	DHCP/BOOTP support.
+ */
+
+#ifdef IPCONFIG_BOOTP
+
+struct bootp_pkt {		/* BOOTP packet format */
+	struct iphdr iph;	/* IP header */
+	struct udphdr udph;	/* UDP header */
+	u8 op;			/* 1=request, 2=reply */
+	u8 htype;		/* HW address type */
+	u8 hlen;		/* HW address length */
+	u8 hops;		/* Used only by gateways */
+	u32 xid;		/* Transaction ID */
+	u16 secs;		/* Seconds since we started */
+	u16 flags;		/* Just what it says */
+	u32 client_ip;		/* Client's IP address if known */
+	u32 your_ip;		/* Assigned IP address */
+	u32 server_ip;		/* (Next, e.g. NFS) Server's IP address */
+	u32 relay_ip;		/* IP address of BOOTP relay */
+	u8 hw_addr[16];		/* Client's HW address */
+	u8 serv_name[64];	/* Server host name */
+	u8 boot_file[128];	/* Name of boot file */
+	u8 exten[312];		/* DHCP options / BOOTP vendor extensions */
+};
+
+/* packet ops */
+#define BOOTP_REQUEST	1
+#define BOOTP_REPLY	2
+
+/* DHCP message types */
+#define DHCPDISCOVER	1
+#define DHCPOFFER	2
+#define DHCPREQUEST	3
+#define DHCPDECLINE	4
+#define DHCPACK		5
+#define DHCPNAK		6
+#define DHCPRELEASE	7
+#define DHCPINFORM	8
+
+static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt);
+
+static struct packet_type bootp_packet_type __initdata = {
+	.type =	__constant_htons(ETH_P_IP),
+	.func =	ic_bootp_recv,
+};
+
+
+/*
+ *  Initialize DHCP/BOOTP extension fields in the request.
+ */
+
+static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
+
+#ifdef IPCONFIG_DHCP
+
+static void __init
+ic_dhcp_init_options(u8 *options)
+{
+	u8 mt = ((ic_servaddr == INADDR_NONE)
+		 ? DHCPDISCOVER : DHCPREQUEST);
+	u8 *e = options;
+
+#ifdef IPCONFIG_DEBUG
+	printk("DHCP: Sending message type %d\n", mt);
+#endif
+
+	memcpy(e, ic_bootp_cookie, 4);	/* RFC1048 Magic Cookie */
+	e += 4;
+
+	*e++ = 53;		/* DHCP message type */
+	*e++ = 1;
+	*e++ = mt;
+
+	if (mt == DHCPREQUEST) {
+		*e++ = 54;	/* Server ID (IP address) */
+		*e++ = 4;
+		memcpy(e, &ic_servaddr, 4);
+		e += 4;
+
+		*e++ = 50;	/* Requested IP address */
+		*e++ = 4;
+		memcpy(e, &ic_myaddr, 4);
+		e += 4;
+	}
+
+	/* always? */
+	{
+		static const u8 ic_req_params[] = {
+			1,	/* Subnet mask */
+			3,	/* Default gateway */
+			6,	/* DNS server */
+			12,	/* Host name */
+			15,	/* Domain name */
+			17,	/* Boot path */
+			40,	/* NIS domain name */
+		};
+
+		*e++ = 55;	/* Parameter request list */
+		*e++ = sizeof(ic_req_params);
+		memcpy(e, ic_req_params, sizeof(ic_req_params));
+		e += sizeof(ic_req_params);
+	}
+
+	*e++ = 255;	/* End of the list */
+}
+
+#endif /* IPCONFIG_DHCP */
+
+static void __init ic_bootp_init_ext(u8 *e)
+{
+	memcpy(e, ic_bootp_cookie, 4);	/* RFC1048 Magic Cookie */
+	e += 4;
+	*e++ = 1;		/* Subnet mask request */
+	*e++ = 4;
+	e += 4;
+	*e++ = 3;		/* Default gateway request */
+	*e++ = 4;
+	e += 4;
+	*e++ = 5;		/* Name server request */
+	*e++ = 8;
+	e += 8;
+	*e++ = 12;		/* Host name request */
+	*e++ = 32;
+	e += 32;
+	*e++ = 40;		/* NIS Domain name request */
+	*e++ = 32;
+	e += 32;
+	*e++ = 17;		/* Boot path */
+	*e++ = 40;
+	e += 40;
+
+	*e++ = 57;		/* set extension buffer size for reply */ 
+	*e++ = 2;
+	*e++ = 1;		/* 128+236+8+20+14, see dhcpd sources */ 
+	*e++ = 150;
+
+	*e++ = 255;		/* End of the list */
+}
+
+
+/*
+ *  Initialize the DHCP/BOOTP mechanism.
+ */
+static inline void ic_bootp_init(void)
+{
+	int i;
+
+	for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
+		ic_nameservers[i] = INADDR_NONE;
+
+	dev_add_pack(&bootp_packet_type);
+}
+
+
+/*
+ *  DHCP/BOOTP cleanup.
+ */
+static inline void ic_bootp_cleanup(void)
+{
+	dev_remove_pack(&bootp_packet_type);
+}
+
+
+/*
+ *  Send DHCP/BOOTP request to single interface.
+ */
+static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff)
+{
+	struct net_device *dev = d->dev;
+	struct sk_buff *skb;
+	struct bootp_pkt *b;
+	int hh_len = LL_RESERVED_SPACE(dev);
+	struct iphdr *h;
+
+	/* Allocate packet */
+	skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL);
+	if (!skb)
+		return;
+	skb_reserve(skb, hh_len);
+	b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt));
+	memset(b, 0, sizeof(struct bootp_pkt));
+
+	/* Construct IP header */
+	skb->nh.iph = h = &b->iph;
+	h->version = 4;
+	h->ihl = 5;
+	h->tot_len = htons(sizeof(struct bootp_pkt));
+	h->frag_off = htons(IP_DF);
+	h->ttl = 64;
+	h->protocol = IPPROTO_UDP;
+	h->daddr = INADDR_BROADCAST;
+	h->check = ip_fast_csum((unsigned char *) h, h->ihl);
+
+	/* Construct UDP header */
+	b->udph.source = htons(68);
+	b->udph.dest = htons(67);
+	b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr));
+	/* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */
+
+	/* Construct DHCP/BOOTP header */
+	b->op = BOOTP_REQUEST;
+	if (dev->type < 256) /* check for false types */
+		b->htype = dev->type;
+	else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */
+		b->htype = ARPHRD_IEEE802;
+	else if (dev->type == ARPHRD_FDDI)
+		b->htype = ARPHRD_ETHER;
+	else {
+		printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+		b->htype = dev->type; /* can cause undefined behavior */
+	}
+	b->hlen = dev->addr_len;
+	b->your_ip = INADDR_NONE;
+	b->server_ip = INADDR_NONE;
+	memcpy(b->hw_addr, dev->dev_addr, dev->addr_len);
+	b->secs = htons(jiffies_diff / HZ);
+	b->xid = d->xid;
+
+	/* add DHCP options or BOOTP extensions */
+#ifdef IPCONFIG_DHCP
+	if (ic_proto_enabled & IC_USE_DHCP)
+		ic_dhcp_init_options(b->exten);
+	else
+#endif
+		ic_bootp_init_ext(b->exten);
+
+	/* Chain packet down the line... */
+	skb->dev = dev;
+	skb->protocol = htons(ETH_P_IP);
+	if ((dev->hard_header &&
+	     dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) ||
+	    dev_queue_xmit(skb) < 0)
+		printk("E");
+}
+
+
+/*
+ *  Copy BOOTP-supplied string if not already set.
+ */
+static int __init ic_bootp_string(char *dest, char *src, int len, int max)
+{
+	if (!len)
+		return 0;
+	if (len > max-1)
+		len = max-1;
+	memcpy(dest, src, len);
+	dest[len] = '\0';
+	return 1;
+}
+
+
+/*
+ *  Process BOOTP extensions.
+ */
+static void __init ic_do_bootp_ext(u8 *ext)
+{
+       u8 servers;
+       int i;
+
+#ifdef IPCONFIG_DEBUG
+	u8 *c;
+
+	printk("DHCP/BOOTP: Got extension %d:",*ext);
+	for(c=ext+2; c<ext+2+ext[1]; c++)
+		printk(" %02x", *c);
+	printk("\n");
+#endif
+
+	switch (*ext++) {
+		case 1:		/* Subnet mask */
+			if (ic_netmask == INADDR_NONE)
+				memcpy(&ic_netmask, ext+1, 4);
+			break;
+		case 3:		/* Default gateway */
+			if (ic_gateway == INADDR_NONE)
+				memcpy(&ic_gateway, ext+1, 4);
+			break;
+		case 6:		/* DNS server */
+			servers= *ext/4;
+			if (servers > CONF_NAMESERVERS_MAX)
+				servers = CONF_NAMESERVERS_MAX;
+			for (i = 0; i < servers; i++) {
+				if (ic_nameservers[i] == INADDR_NONE)
+					memcpy(&ic_nameservers[i], ext+1+4*i, 4);
+			}
+			break;
+		case 12:	/* Host name */
+			ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN);
+			ic_host_name_set = 1;
+			break;
+		case 15:	/* Domain name (DNS) */
+			ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain));
+			break;
+		case 17:	/* Root path */
+			if (!root_server_path[0])
+				ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path));
+			break;
+		case 40:	/* NIS Domain name (_not_ DNS) */
+			ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN);
+			break;
+	}
+}
+
+
+/*
+ *  Receive BOOTP reply.
+ */
+static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt)
+{
+	struct bootp_pkt *b;
+	struct iphdr *h;
+	struct ic_device *d;
+	int len, ext_len;
+
+	/* Perform verifications before taking the lock.  */
+	if (skb->pkt_type == PACKET_OTHERHOST)
+		goto drop;
+
+	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+		return NET_RX_DROP;
+
+	if (!pskb_may_pull(skb,
+			   sizeof(struct iphdr) +
+			   sizeof(struct udphdr)))
+		goto drop;
+
+	b = (struct bootp_pkt *) skb->nh.iph;
+	h = &b->iph;
+
+	if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP)
+		goto drop;
+
+	/* Fragments are not supported */
+	if (h->frag_off & htons(IP_OFFSET | IP_MF)) {
+		if (net_ratelimit())
+			printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented "
+			       "reply.\n");
+		goto drop;
+	}
+
+	if (skb->len < ntohs(h->tot_len))
+		goto drop;
+
+	if (ip_fast_csum((char *) h, h->ihl))
+		goto drop;
+
+	if (b->udph.source != htons(67) || b->udph.dest != htons(68))
+		goto drop;
+
+	if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr))
+		goto drop;
+
+	len = ntohs(b->udph.len) - sizeof(struct udphdr);
+	ext_len = len - (sizeof(*b) -
+			 sizeof(struct iphdr) -
+			 sizeof(struct udphdr) -
+			 sizeof(b->exten));
+	if (ext_len < 0)
+		goto drop;
+
+	/* Ok the front looks good, make sure we can get at the rest.  */
+	if (!pskb_may_pull(skb, skb->len))
+		goto drop;
+
+	b = (struct bootp_pkt *) skb->nh.iph;
+	h = &b->iph;
+
+	/* One reply at a time, please. */
+	spin_lock(&ic_recv_lock);
+
+	/* If we already have a reply, just drop the packet */
+	if (ic_got_reply)
+		goto drop_unlock;
+
+	/* Find the ic_device that the packet arrived on */
+	d = ic_first_dev;
+	while (d && d->dev != dev)
+		d = d->next;
+	if (!d)
+		goto drop_unlock;  /* should never happen */
+
+	/* Is it a reply to our BOOTP request? */
+	if (b->op != BOOTP_REPLY ||
+	    b->xid != d->xid) {
+		if (net_ratelimit())
+			printk(KERN_ERR "DHCP/BOOTP: Reply not for us, "
+			       "op[%x] xid[%x]\n",
+			       b->op, b->xid);
+		goto drop_unlock;
+	}
+
+	/* Parse extensions */
+	if (ext_len >= 4 &&
+	    !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */
+                u8 *end = (u8 *) b + ntohs(b->iph.tot_len);
+		u8 *ext;
+
+#ifdef IPCONFIG_DHCP
+		if (ic_proto_enabled & IC_USE_DHCP) {
+			u32 server_id = INADDR_NONE;
+			int mt = 0;
+
+			ext = &b->exten[4];
+			while (ext < end && *ext != 0xff) {
+				u8 *opt = ext++;
+				if (*opt == 0)	/* Padding */
+					continue;
+				ext += *ext + 1;
+				if (ext >= end)
+					break;
+				switch (*opt) {
+				case 53:	/* Message type */
+					if (opt[1])
+						mt = opt[2];
+					break;
+				case 54:	/* Server ID (IP address) */
+					if (opt[1] >= 4)
+						memcpy(&server_id, opt + 2, 4);
+					break;
+				};
+			}
+
+#ifdef IPCONFIG_DEBUG
+			printk("DHCP: Got message type %d\n", mt);
+#endif
+
+			switch (mt) {
+			case DHCPOFFER:
+				/* While in the process of accepting one offer,
+				 * ignore all others.
+				 */
+				if (ic_myaddr != INADDR_NONE)
+					goto drop_unlock;
+
+				/* Let's accept that offer. */
+				ic_myaddr = b->your_ip;
+				ic_servaddr = server_id;
+#ifdef IPCONFIG_DEBUG
+				printk("DHCP: Offered address %u.%u.%u.%u",
+				       NIPQUAD(ic_myaddr));
+				printk(" by server %u.%u.%u.%u\n",
+				       NIPQUAD(ic_servaddr));
+#endif
+				/* The DHCP indicated server address takes
+				 * precedence over the bootp header one if
+				 * they are different.
+				 */
+				if ((server_id != INADDR_NONE) &&
+				    (b->server_ip != server_id))
+					b->server_ip = ic_servaddr;
+				break;
+
+			case DHCPACK:
+				if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0)
+					goto drop_unlock;
+
+				/* Yeah! */
+				break;
+
+			default:
+				/* Urque.  Forget it*/
+				ic_myaddr = INADDR_NONE;
+				ic_servaddr = INADDR_NONE;
+				goto drop_unlock;
+			};
+
+			ic_dhcp_msgtype = mt;
+
+		}
+#endif /* IPCONFIG_DHCP */
+
+		ext = &b->exten[4];
+		while (ext < end && *ext != 0xff) {
+			u8 *opt = ext++;
+			if (*opt == 0)	/* Padding */
+				continue;
+			ext += *ext + 1;
+			if (ext < end)
+				ic_do_bootp_ext(opt);
+		}
+	}
+
+	/* We have a winner! */
+	ic_dev = dev;
+	ic_myaddr = b->your_ip;
+	ic_servaddr = b->server_ip;
+	if (ic_gateway == INADDR_NONE && b->relay_ip)
+		ic_gateway = b->relay_ip;
+	if (ic_nameservers[0] == INADDR_NONE)
+		ic_nameservers[0] = ic_servaddr;
+	ic_got_reply = IC_BOOTP;
+
+drop_unlock:
+	/* Show's over.  Nothing to see here.  */
+	spin_unlock(&ic_recv_lock);
+
+drop:
+	/* Throw the packet out. */
+	kfree_skb(skb);
+
+	return 0;
+}	
+
+
+#endif
+
+
+/*
+ *	Dynamic IP configuration -- DHCP, BOOTP, RARP.
+ */
+
+#ifdef IPCONFIG_DYNAMIC
+
+static int __init ic_dynamic(void)
+{
+	int retries;
+	struct ic_device *d;
+	unsigned long start_jiffies, timeout, jiff;
+	int do_bootp = ic_proto_have_if & IC_BOOTP;
+	int do_rarp = ic_proto_have_if & IC_RARP;
+
+	/*
+	 * If none of DHCP/BOOTP/RARP was selected, return with an error.
+	 * This routine gets only called when some pieces of information
+	 * are missing, and without DHCP/BOOTP/RARP we are unable to get it.
+	 */
+	if (!ic_proto_enabled) {
+		printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+		return -1;
+	}
+
+#ifdef IPCONFIG_BOOTP
+	if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP)
+		printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n");
+#endif
+#ifdef IPCONFIG_RARP
+	if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP)
+		printk(KERN_ERR "RARP: No suitable device found.\n");
+#endif
+
+	if (!ic_proto_have_if)
+		/* Error message already printed */
+		return -1;
+
+	/*
+	 * Setup protocols
+	 */
+#ifdef IPCONFIG_BOOTP
+	if (do_bootp)
+		ic_bootp_init();
+#endif
+#ifdef IPCONFIG_RARP
+	if (do_rarp)
+		ic_rarp_init();
+#endif
+
+	/*
+	 * Send requests and wait, until we get an answer. This loop
+	 * seems to be a terrible waste of CPU time, but actually there is
+	 * only one process running at all, so we don't need to use any
+	 * scheduler functions.
+	 * [Actually we could now, but the nothing else running note still 
+	 *  applies.. - AC]
+	 */
+	printk(KERN_NOTICE "Sending %s%s%s requests .",
+	       do_bootp
+		? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "",
+	       (do_bootp && do_rarp) ? " and " : "",
+	       do_rarp ? "RARP" : "");
+
+	start_jiffies = jiffies;
+	d = ic_first_dev;
+	retries = CONF_SEND_RETRIES;
+	get_random_bytes(&timeout, sizeof(timeout));
+	timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM);
+	for(;;) {
+#ifdef IPCONFIG_BOOTP
+		if (do_bootp && (d->able & IC_BOOTP))
+			ic_bootp_send_if(d, jiffies - start_jiffies);
+#endif
+#ifdef IPCONFIG_RARP
+		if (do_rarp && (d->able & IC_RARP))
+			ic_rarp_send_if(d);
+#endif
+
+		jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout);
+		while (time_before(jiffies, jiff) && !ic_got_reply) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(1);
+		}
+#ifdef IPCONFIG_DHCP
+		/* DHCP isn't done until we get a DHCPACK. */
+		if ((ic_got_reply & IC_BOOTP)
+		    && (ic_proto_enabled & IC_USE_DHCP)
+		    && ic_dhcp_msgtype != DHCPACK)
+		{
+			ic_got_reply = 0;
+			printk(",");
+			continue;
+		}
+#endif /* IPCONFIG_DHCP */
+
+		if (ic_got_reply) {
+			printk(" OK\n");
+			break;
+		}
+
+		if ((d = d->next))
+			continue;
+
+		if (! --retries) {
+			printk(" timed out!\n");
+			break;
+		}
+
+		d = ic_first_dev;
+
+		timeout = timeout CONF_TIMEOUT_MULT;
+		if (timeout > CONF_TIMEOUT_MAX)
+			timeout = CONF_TIMEOUT_MAX;
+
+		printk(".");
+	}
+
+#ifdef IPCONFIG_BOOTP
+	if (do_bootp)
+		ic_bootp_cleanup();
+#endif
+#ifdef IPCONFIG_RARP
+	if (do_rarp)
+		ic_rarp_cleanup();
+#endif
+
+	if (!ic_got_reply)
+		return -1;
+
+	printk("IP-Config: Got %s answer from %u.%u.%u.%u, ",
+		((ic_got_reply & IC_RARP) ? "RARP" 
+		 : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+		NIPQUAD(ic_servaddr));
+	printk("my address is %u.%u.%u.%u\n", NIPQUAD(ic_myaddr));
+
+	return 0;
+}
+
+#endif /* IPCONFIG_DYNAMIC */
+
+#ifdef CONFIG_PROC_FS
+
+static int pnp_seq_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	if (ic_proto_used & IC_PROTO)
+		seq_printf(seq, "#PROTO: %s\n",
+			   (ic_proto_used & IC_RARP) ? "RARP"
+			   : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP");
+	else
+		seq_puts(seq, "#MANUAL\n");
+
+	if (ic_domain[0])
+		seq_printf(seq,
+			   "domain %s\n", ic_domain);
+	for (i = 0; i < CONF_NAMESERVERS_MAX; i++) {
+		if (ic_nameservers[i] != INADDR_NONE)
+			seq_printf(seq,
+				   "nameserver %u.%u.%u.%u\n",
+				   NIPQUAD(ic_nameservers[i]));
+	}
+	if (ic_servaddr != INADDR_NONE)
+		seq_printf(seq,
+			   "bootserver %u.%u.%u.%u\n",
+			   NIPQUAD(ic_servaddr));
+	return 0;
+}
+
+static int pnp_seq_open(struct inode *indoe, struct file *file)
+{
+	return single_open(file, pnp_seq_show, NULL);
+}
+
+static struct file_operations pnp_seq_fops = {
+	.owner		= THIS_MODULE,
+	.open		= pnp_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif /* CONFIG_PROC_FS */
+
+/*
+ *  Extract IP address from the parameter string if needed. Note that we
+ *  need to have root_server_addr set _before_ IPConfig gets called as it
+ *  can override it.
+ */
+u32 __init root_nfs_parse_addr(char *name)
+{
+	u32 addr;
+	int octets = 0;
+	char *cp, *cq;
+
+	cp = cq = name;
+	while (octets < 4) {
+		while (*cp >= '0' && *cp <= '9')
+			cp++;
+		if (cp == cq || cp - cq > 3)
+			break;
+		if (*cp == '.' || octets == 3)
+			octets++;
+		if (octets < 4)
+			cp++;
+		cq = cp;
+	}
+	if (octets == 4 && (*cp == ':' || *cp == '\0')) {
+		if (*cp == ':')
+			*cp++ = '\0';
+		addr = in_aton(name);
+		memmove(name, cp, strlen(cp) + 1);
+	} else
+		addr = INADDR_NONE;
+
+	return addr;
+}
+
+/*
+ *	IP Autoconfig dispatcher.
+ */
+
+static int __init ip_auto_config(void)
+{
+	u32 addr;
+
+#ifdef CONFIG_PROC_FS
+	proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops);
+#endif /* CONFIG_PROC_FS */
+
+	if (!ic_enable)
+		return 0;
+
+	DBG(("IP-Config: Entered.\n"));
+#ifdef IPCONFIG_DYNAMIC
+ try_try_again:
+#endif
+	/* Give hardware a chance to settle */
+	msleep(CONF_PRE_OPEN);
+
+	/* Setup all network devices */
+	if (ic_open_devs() < 0)
+		return -1;
+
+	/* Give drivers a chance to settle */
+	ssleep(CONF_POST_OPEN);
+
+	/*
+	 * If the config information is insufficient (e.g., our IP address or
+	 * IP address of the boot server is missing or we have multiple network
+	 * interfaces and no default was set), use BOOTP or RARP to get the
+	 * missing values.
+	 */
+	if (ic_myaddr == INADDR_NONE ||
+#ifdef CONFIG_ROOT_NFS
+	    (MAJOR(ROOT_DEV) == UNNAMED_MAJOR
+	     && root_server_addr == INADDR_NONE
+	     && ic_servaddr == INADDR_NONE) ||
+#endif
+	    ic_first_dev->next) {
+#ifdef IPCONFIG_DYNAMIC
+	
+		int retries = CONF_OPEN_RETRIES;
+
+		if (ic_dynamic() < 0) {
+			ic_close_devs();
+
+			/*
+			 * I don't know why, but sometimes the
+			 * eepro100 driver (at least) gets upset and
+			 * doesn't work the first time it's opened.
+			 * But then if you close it and reopen it, it
+			 * works just fine.  So we need to try that at
+			 * least once before giving up.
+			 *
+			 * Also, if the root will be NFS-mounted, we
+			 * have nowhere to go if DHCP fails.  So we
+			 * just have to keep trying forever.
+			 *
+			 * 				-- Chip
+			 */
+#ifdef CONFIG_ROOT_NFS
+			if (ROOT_DEV ==  Root_NFS) {
+				printk(KERN_ERR 
+					"IP-Config: Retrying forever (NFS root)...\n");
+				goto try_try_again;
+			}
+#endif
+
+			if (--retries) {
+				printk(KERN_ERR 
+				       "IP-Config: Reopening network devices...\n");
+				goto try_try_again;
+			}
+
+			/* Oh, well.  At least we tried. */
+			printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n");
+			return -1;
+		}
+#else /* !DYNAMIC */
+		printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n");
+		ic_close_devs();
+		return -1;
+#endif /* IPCONFIG_DYNAMIC */
+	} else {
+		/* Device selected manually or only one device -> use it */
+		ic_dev = ic_first_dev->dev;
+	}
+
+	addr = root_nfs_parse_addr(root_server_path);
+	if (root_server_addr == INADDR_NONE)
+		root_server_addr = addr;
+
+	/*
+	 * Use defaults whereever applicable.
+	 */
+	if (ic_defaults() < 0)
+		return -1;
+
+	/*
+	 * Close all network devices except the device we've
+	 * autoconfigured and set up routes.
+	 */
+	ic_close_devs();
+	if (ic_setup_if() < 0 || ic_setup_routes() < 0)
+		return -1;
+
+	/*
+	 * Record which protocol was actually used.
+	 */
+#ifdef IPCONFIG_DYNAMIC
+	ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP);
+#endif
+
+#ifndef IPCONFIG_SILENT
+	/*
+	 * Clue in the operator.
+	 */
+	printk("IP-Config: Complete:");
+	printk("\n      device=%s", ic_dev->name);
+	printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr));
+	printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask));
+	printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway));
+	printk(",\n     host=%s, domain=%s, nis-domain=%s",
+	       system_utsname.nodename, ic_domain, system_utsname.domainname);
+	printk(",\n     bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr));
+	printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr));
+	printk(", rootpath=%s", root_server_path);
+	printk("\n");
+#endif /* !SILENT */
+
+	return 0;
+}
+
+late_initcall(ip_auto_config);
+
+
+/*
+ *  Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
+ *  command line parameter. It consists of option fields separated by colons in
+ *  the following order:
+ *
+ *  <client-ip>:<server-ip>:<gw-ip>:<netmask>:<host name>:<device>:<PROTO>
+ *
+ *  Any of the fields can be empty which means to use a default value:
+ *	<client-ip>	- address given by BOOTP or RARP
+ *	<server-ip>	- address of host returning BOOTP or RARP packet
+ *	<gw-ip>		- none, or the address returned by BOOTP
+ *	<netmask>	- automatically determined from <client-ip>, or the
+ *			  one returned by BOOTP
+ *	<host name>	- <client-ip> in ASCII notation, or the name returned
+ *			  by BOOTP
+ *	<device>	- use all available devices
+ *	<PROTO>:
+ *	   off|none	    - don't do autoconfig at all (DEFAULT)
+ *	   on|any           - use any configured protocol
+ *	   dhcp|bootp|rarp  - use only the specified protocol
+ *	   both             - use both BOOTP and RARP (not DHCP)
+ */
+static int __init ic_proto_name(char *name)
+{
+	if (!strcmp(name, "on") || !strcmp(name, "any")) {
+		return 1;
+	}
+#ifdef CONFIG_IP_PNP_DHCP
+	else if (!strcmp(name, "dhcp")) {
+		ic_proto_enabled &= ~IC_RARP;
+		return 1;
+	}
+#endif
+#ifdef CONFIG_IP_PNP_BOOTP
+	else if (!strcmp(name, "bootp")) {
+		ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP);
+		return 1;
+	}
+#endif
+#ifdef CONFIG_IP_PNP_RARP
+	else if (!strcmp(name, "rarp")) {
+		ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP);
+		return 1;
+	}
+#endif
+#ifdef IPCONFIG_DYNAMIC
+	else if (!strcmp(name, "both")) {
+		ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+static int __init ip_auto_config_setup(char *addrs)
+{
+	char *cp, *ip, *dp;
+	int num = 0;
+
+	ic_set_manually = 1;
+
+	ic_enable = (*addrs && 
+		(strcmp(addrs, "off") != 0) && 
+		(strcmp(addrs, "none") != 0));
+	if (!ic_enable)
+		return 1;
+
+	if (ic_proto_name(addrs))
+		return 1;
+
+	/* Parse the whole string */
+	ip = addrs;
+	while (ip && *ip) {
+		if ((cp = strchr(ip, ':')))
+			*cp++ = '\0';
+		if (strlen(ip) > 0) {
+			DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+			switch (num) {
+			case 0:
+				if ((ic_myaddr = in_aton(ip)) == INADDR_ANY)
+					ic_myaddr = INADDR_NONE;
+				break;
+			case 1:
+				if ((ic_servaddr = in_aton(ip)) == INADDR_ANY)
+					ic_servaddr = INADDR_NONE;
+				break;
+			case 2:
+				if ((ic_gateway = in_aton(ip)) == INADDR_ANY)
+					ic_gateway = INADDR_NONE;
+				break;
+			case 3:
+				if ((ic_netmask = in_aton(ip)) == INADDR_ANY)
+					ic_netmask = INADDR_NONE;
+				break;
+			case 4:
+				if ((dp = strchr(ip, '.'))) {
+					*dp++ = '\0';
+					strlcpy(system_utsname.domainname, dp,
+						sizeof(system_utsname.domainname));
+				}
+				strlcpy(system_utsname.nodename, ip,
+					sizeof(system_utsname.nodename));
+				ic_host_name_set = 1;
+				break;
+			case 5:
+				strlcpy(user_dev_name, ip, sizeof(user_dev_name));
+				break;
+			case 6:
+				ic_proto_name(ip);
+				break;
+			}
+		}
+		ip = cp;
+		num++;
+	}
+
+	return 1;
+}
+
+static int __init nfsaddrs_config_setup(char *addrs)
+{
+	return ip_auto_config_setup(addrs);
+}
+
+__setup("ip=", ip_auto_config_setup);
+__setup("nfsaddrs=", nfsaddrs_config_setup);
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
new file mode 100644
index 000000000000..68a78731f722
--- /dev/null
+++ b/net/ipv4/ipip.c
@@ -0,0 +1,905 @@
+/*
+ *	Linux NET3:	IP/IP protocol decoder. 
+ *
+ *	Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $
+ *
+ *	Authors:
+ *		Sam Lantinga (slouken@cs.ucdavis.edu)  02/01/95
+ *
+ *	Fixes:
+ *		Alan Cox	:	Merged and made usable non modular (its so tiny its silly as
+ *					a module taking up 2 pages).
+ *		Alan Cox	: 	Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph)
+ *					to keep ip_forward happy.
+ *		Alan Cox	:	More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8).
+ *		Kai Schulte	:	Fixed #defines for IP_FIREWALL->FIREWALL
+ *              David Woodhouse :       Perform some basic ICMP handling.
+ *                                      IPIP Routing without decapsulation.
+ *              Carlos Picoto   :       GRE over IP support
+ *		Alexey Kuznetsov:	Reworked. Really, now it is truncated version of ipv4/ip_gre.c.
+ *					I do not want to merge them together.
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+/* tunnel.c: an IP tunnel driver
+
+	The purpose of this driver is to provide an IP tunnel through
+	which you can tunnel network traffic transparently across subnets.
+
+	This was written by looking at Nick Holloway's dummy driver
+	Thanks for the great code!
+
+		-Sam Lantinga	(slouken@cs.ucdavis.edu)  02/01/95
+		
+	Minor tweaks:
+		Cleaned up the code a little and added some pre-1.3.0 tweaks.
+		dev->hard_header/hard_header_len changed to use no headers.
+		Comments/bracketing tweaked.
+		Made the tunnels use dev->name not tunnel: when error reporting.
+		Added tx_dropped stat
+		
+		-Alan Cox	(Alan.Cox@linux.org) 21 March 95
+
+	Reworked:
+		Changed to tunnel to destination gateway in addition to the
+			tunnel's pointopoint address
+		Almost completely rewritten
+		Note:  There is currently no firewall or ICMP handling done.
+
+		-Sam Lantinga	(slouken@cs.ucdavis.edu) 02/13/96
+		
+*/
+
+/* Things I wish I had known when writing the tunnel driver:
+
+	When the tunnel_xmit() function is called, the skb contains the
+	packet to be sent (plus a great deal of extra info), and dev
+	contains the tunnel device that _we_ are.
+
+	When we are passed a packet, we are expected to fill in the
+	source address with our source IP address.
+
+	What is the proper way to allocate, copy and free a buffer?
+	After you allocate it, it is a "0 length" chunk of memory
+	starting at zero.  If you want to add headers to the buffer
+	later, you'll have to call "skb_reserve(skb, amount)" with
+	the amount of memory you want reserved.  Then, you call
+	"skb_put(skb, amount)" with the amount of space you want in
+	the buffer.  skb_put() returns a pointer to the top (#0) of
+	that buffer.  skb->len is set to the amount of space you have
+	"allocated" with skb_put().  You can then write up to skb->len
+	bytes to that buffer.  If you need more, you can call skb_put()
+	again with the additional amount of space you need.  You can
+	find out how much more space you can allocate by calling 
+	"skb_tailroom(skb)".
+	Now, to add header space, call "skb_push(skb, header_len)".
+	This creates space at the beginning of the buffer and returns
+	a pointer to this new space.  If later you need to strip a
+	header from a buffer, call "skb_pull(skb, header_len)".
+	skb_headroom() will return how much space is left at the top
+	of the buffer (before the main data).  Remember, this headroom
+	space must be reserved before the skb_put() function is called.
+	*/
+
+/*
+   This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c
+
+   For comments look at net/ipv4/ip_gre.c --ANK
+ */
+
+ 
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <asm/uaccess.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/if_arp.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/sock.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/ipip.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+
+#define HASH_SIZE  16
+#define HASH(addr) ((addr^(addr>>4))&0xF)
+
+static int ipip_fb_tunnel_init(struct net_device *dev);
+static int ipip_tunnel_init(struct net_device *dev);
+static void ipip_tunnel_setup(struct net_device *dev);
+
+static struct net_device *ipip_fb_tunnel_dev;
+
+static struct ip_tunnel *tunnels_r_l[HASH_SIZE];
+static struct ip_tunnel *tunnels_r[HASH_SIZE];
+static struct ip_tunnel *tunnels_l[HASH_SIZE];
+static struct ip_tunnel *tunnels_wc[1];
+static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l };
+
+static DEFINE_RWLOCK(ipip_lock);
+
+static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local)
+{
+	unsigned h0 = HASH(remote);
+	unsigned h1 = HASH(local);
+	struct ip_tunnel *t;
+
+	for (t = tunnels_r_l[h0^h1]; t; t = t->next) {
+		if (local == t->parms.iph.saddr &&
+		    remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+			return t;
+	}
+	for (t = tunnels_r[h0]; t; t = t->next) {
+		if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
+			return t;
+	}
+	for (t = tunnels_l[h1]; t; t = t->next) {
+		if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
+			return t;
+	}
+	if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
+		return t;
+	return NULL;
+}
+
+static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t)
+{
+	u32 remote = t->parms.iph.daddr;
+	u32 local = t->parms.iph.saddr;
+	unsigned h = 0;
+	int prio = 0;
+
+	if (remote) {
+		prio |= 2;
+		h ^= HASH(remote);
+	}
+	if (local) {
+		prio |= 1;
+		h ^= HASH(local);
+	}
+	return &tunnels[prio][h];
+}
+
+
+static void ipip_tunnel_unlink(struct ip_tunnel *t)
+{
+	struct ip_tunnel **tp;
+
+	for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) {
+		if (t == *tp) {
+			write_lock_bh(&ipip_lock);
+			*tp = t->next;
+			write_unlock_bh(&ipip_lock);
+			break;
+		}
+	}
+}
+
+static void ipip_tunnel_link(struct ip_tunnel *t)
+{
+	struct ip_tunnel **tp = ipip_bucket(t);
+
+	t->next = *tp;
+	write_lock_bh(&ipip_lock);
+	*tp = t;
+	write_unlock_bh(&ipip_lock);
+}
+
+static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create)
+{
+	u32 remote = parms->iph.daddr;
+	u32 local = parms->iph.saddr;
+	struct ip_tunnel *t, **tp, *nt;
+	struct net_device *dev;
+	unsigned h = 0;
+	int prio = 0;
+	char name[IFNAMSIZ];
+
+	if (remote) {
+		prio |= 2;
+		h ^= HASH(remote);
+	}
+	if (local) {
+		prio |= 1;
+		h ^= HASH(local);
+	}
+	for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) {
+		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
+			return t;
+	}
+	if (!create)
+		return NULL;
+
+	if (parms->name[0])
+		strlcpy(name, parms->name, IFNAMSIZ);
+	else {
+		int i;
+		for (i=1; i<100; i++) {
+			sprintf(name, "tunl%d", i);
+			if (__dev_get_by_name(name) == NULL)
+				break;
+		}
+		if (i==100)
+			goto failed;
+	}
+
+	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
+	if (dev == NULL)
+		return NULL;
+
+	nt = dev->priv;
+	SET_MODULE_OWNER(dev);
+	dev->init = ipip_tunnel_init;
+	nt->parms = *parms;
+
+	if (register_netdevice(dev) < 0) {
+		free_netdev(dev);
+		goto failed;
+	}
+
+	dev_hold(dev);
+	ipip_tunnel_link(nt);
+	/* Do not decrement MOD_USE_COUNT here. */
+	return nt;
+
+failed:
+	return NULL;
+}
+
+static void ipip_tunnel_uninit(struct net_device *dev)
+{
+	if (dev == ipip_fb_tunnel_dev) {
+		write_lock_bh(&ipip_lock);
+		tunnels_wc[0] = NULL;
+		write_unlock_bh(&ipip_lock);
+	} else
+		ipip_tunnel_unlink((struct ip_tunnel*)dev->priv);
+	dev_put(dev);
+}
+
+static void ipip_err(struct sk_buff *skb, void *__unused)
+{
+#ifndef I_WISH_WORLD_WERE_PERFECT
+
+/* It is not :-( All the routers (except for Linux) return only
+   8 bytes of packet payload. It means, that precise relaying of
+   ICMP in the real Internet is absolutely infeasible.
+ */
+	struct iphdr *iph = (struct iphdr*)skb->data;
+	int type = skb->h.icmph->type;
+	int code = skb->h.icmph->code;
+	struct ip_tunnel *t;
+
+	switch (type) {
+	default:
+	case ICMP_PARAMETERPROB:
+		return;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return;
+		case ICMP_FRAG_NEEDED:
+			/* Soft state for pmtu is maintained by IP core. */
+			return;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe they are just ether pollution. --ANK
+			 */
+			break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return;
+		break;
+	}
+
+	read_lock(&ipip_lock);
+	t = ipip_tunnel_lookup(iph->daddr, iph->saddr);
+	if (t == NULL || t->parms.iph.daddr == 0)
+		goto out;
+	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+		goto out;
+
+	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
+		t->err_count++;
+	else
+		t->err_count = 1;
+	t->err_time = jiffies;
+out:
+	read_unlock(&ipip_lock);
+	return;
+#else
+	struct iphdr *iph = (struct iphdr*)dp;
+	int hlen = iph->ihl<<2;
+	struct iphdr *eiph;
+	int type = skb->h.icmph->type;
+	int code = skb->h.icmph->code;
+	int rel_type = 0;
+	int rel_code = 0;
+	int rel_info = 0;
+	struct sk_buff *skb2;
+	struct flowi fl;
+	struct rtable *rt;
+
+	if (len < hlen + sizeof(struct iphdr))
+		return;
+	eiph = (struct iphdr*)(dp + hlen);
+
+	switch (type) {
+	default:
+		return;
+	case ICMP_PARAMETERPROB:
+		if (skb->h.icmph->un.gateway < hlen)
+			return;
+
+		/* So... This guy found something strange INSIDE encapsulated
+		   packet. Well, he is fool, but what can we do ?
+		 */
+		rel_type = ICMP_PARAMETERPROB;
+		rel_info = skb->h.icmph->un.gateway - hlen;
+		break;
+
+	case ICMP_DEST_UNREACH:
+		switch (code) {
+		case ICMP_SR_FAILED:
+		case ICMP_PORT_UNREACH:
+			/* Impossible event. */
+			return;
+		case ICMP_FRAG_NEEDED:
+			/* And it is the only really necessary thing :-) */
+			rel_info = ntohs(skb->h.icmph->un.frag.mtu);
+			if (rel_info < hlen+68)
+				return;
+			rel_info -= hlen;
+			/* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */
+			if (rel_info > ntohs(eiph->tot_len))
+				return;
+			break;
+		default:
+			/* All others are translated to HOST_UNREACH.
+			   rfc2003 contains "deep thoughts" about NET_UNREACH,
+			   I believe, it is just ether pollution. --ANK
+			 */
+			rel_type = ICMP_DEST_UNREACH;
+			rel_code = ICMP_HOST_UNREACH;
+			break;
+		}
+		break;
+	case ICMP_TIME_EXCEEDED:
+		if (code != ICMP_EXC_TTL)
+			return;
+		break;
+	}
+
+	/* Prepare fake skb to feed it to icmp_send */
+	skb2 = skb_clone(skb, GFP_ATOMIC);
+	if (skb2 == NULL)
+		return;
+	dst_release(skb2->dst);
+	skb2->dst = NULL;
+	skb_pull(skb2, skb->data - (u8*)eiph);
+	skb2->nh.raw = skb2->data;
+
+	/* Try to guess incoming interface */
+	memset(&fl, 0, sizeof(fl));
+	fl.fl4_daddr = eiph->saddr;
+	fl.fl4_tos = RT_TOS(eiph->tos);
+	fl.proto = IPPROTO_IPIP;
+	if (ip_route_output_key(&rt, &key)) {
+		kfree_skb(skb2);
+		return;
+	}
+	skb2->dev = rt->u.dst.dev;
+
+	/* route "incoming" packet */
+	if (rt->rt_flags&RTCF_LOCAL) {
+		ip_rt_put(rt);
+		rt = NULL;
+		fl.fl4_daddr = eiph->daddr;
+		fl.fl4_src = eiph->saddr;
+		fl.fl4_tos = eiph->tos;
+		if (ip_route_output_key(&rt, &fl) ||
+		    rt->u.dst.dev->type != ARPHRD_TUNNEL) {
+			ip_rt_put(rt);
+			kfree_skb(skb2);
+			return;
+		}
+	} else {
+		ip_rt_put(rt);
+		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) ||
+		    skb2->dst->dev->type != ARPHRD_TUNNEL) {
+			kfree_skb(skb2);
+			return;
+		}
+	}
+
+	/* change mtu on this route */
+	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+		if (rel_info > dst_mtu(skb2->dst)) {
+			kfree_skb(skb2);
+			return;
+		}
+		skb2->dst->ops->update_pmtu(skb2->dst, rel_info);
+		rel_info = htonl(rel_info);
+	} else if (type == ICMP_TIME_EXCEEDED) {
+		struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv;
+		if (t->parms.iph.ttl) {
+			rel_type = ICMP_DEST_UNREACH;
+			rel_code = ICMP_HOST_UNREACH;
+		}
+	}
+
+	icmp_send(skb2, rel_type, rel_code, rel_info);
+	kfree_skb(skb2);
+	return;
+#endif
+}
+
+static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb)
+{
+	struct iphdr *inner_iph = skb->nh.iph;
+
+	if (INET_ECN_is_ce(outer_iph->tos))
+		IP_ECN_set_ce(inner_iph);
+}
+
+static int ipip_rcv(struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	struct ip_tunnel *tunnel;
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto out;
+
+	iph = skb->nh.iph;
+
+	read_lock(&ipip_lock);
+	if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) {
+		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+			read_unlock(&ipip_lock);
+			kfree_skb(skb);
+			return 0;
+		}
+
+		secpath_reset(skb);
+
+		skb->mac.raw = skb->nh.raw;
+		skb->nh.raw = skb->data;
+		memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+		skb->protocol = htons(ETH_P_IP);
+		skb->pkt_type = PACKET_HOST;
+
+		tunnel->stat.rx_packets++;
+		tunnel->stat.rx_bytes += skb->len;
+		skb->dev = tunnel->dev;
+		dst_release(skb->dst);
+		skb->dst = NULL;
+		nf_reset(skb);
+		ipip_ecn_decapsulate(iph, skb);
+		netif_rx(skb);
+		read_unlock(&ipip_lock);
+		return 0;
+	}
+	read_unlock(&ipip_lock);
+
+out:
+	return -1;
+}
+
+/*
+ *	This function assumes it is being called from dev_queue_xmit()
+ *	and that skb is filled properly by that function.
+ */
+
+static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv;
+	struct net_device_stats *stats = &tunnel->stat;
+	struct iphdr  *tiph = &tunnel->parms.iph;
+	u8     tos = tunnel->parms.iph.tos;
+	u16    df = tiph->frag_off;
+	struct rtable *rt;     			/* Route to the other host */
+	struct net_device *tdev;			/* Device to other host */
+	struct iphdr  *old_iph = skb->nh.iph;
+	struct iphdr  *iph;			/* Our new IP header */
+	int    max_headroom;			/* The extra header space needed */
+	u32    dst = tiph->daddr;
+	int    mtu;
+
+	if (tunnel->recursion++) {
+		tunnel->stat.collisions++;
+		goto tx_error;
+	}
+
+	if (skb->protocol != htons(ETH_P_IP))
+		goto tx_error;
+
+	if (tos&1)
+		tos = old_iph->tos;
+
+	if (!dst) {
+		/* NBMA tunnel */
+		if ((rt = (struct rtable*)skb->dst) == NULL) {
+			tunnel->stat.tx_fifo_errors++;
+			goto tx_error;
+		}
+		if ((dst = rt->rt_gateway) == 0)
+			goto tx_error_icmp;
+	}
+
+	{
+		struct flowi fl = { .oif = tunnel->parms.link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = dst,
+						.saddr = tiph->saddr,
+						.tos = RT_TOS(tos) } },
+				    .proto = IPPROTO_IPIP };
+		if (ip_route_output_key(&rt, &fl)) {
+			tunnel->stat.tx_carrier_errors++;
+			goto tx_error_icmp;
+		}
+	}
+	tdev = rt->u.dst.dev;
+
+	if (tdev == dev) {
+		ip_rt_put(rt);
+		tunnel->stat.collisions++;
+		goto tx_error;
+	}
+
+	if (tiph->frag_off)
+		mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+	else
+		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;
+
+	if (mtu < 68) {
+		tunnel->stat.collisions++;
+		ip_rt_put(rt);
+		goto tx_error;
+	}
+	if (skb->dst)
+		skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+	df |= (old_iph->frag_off&htons(IP_DF));
+
+	if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		ip_rt_put(rt);
+		goto tx_error;
+	}
+
+	if (tunnel->err_count > 0) {
+		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
+			tunnel->err_count--;
+			dst_link_failure(skb);
+		} else
+			tunnel->err_count = 0;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr));
+
+	if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+  			stats->tx_dropped++;
+			dev_kfree_skb(skb);
+			tunnel->recursion--;
+			return 0;
+		}
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
+		dev_kfree_skb(skb);
+		skb = new_skb;
+		old_iph = skb->nh.iph;
+	}
+
+	skb->h.raw = skb->nh.raw;
+	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+
+	iph 			=	skb->nh.iph;
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr)>>2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_IPIP;
+	iph->tos		=	INET_ECN_encapsulate(tos, old_iph->tos);
+	iph->daddr		=	rt->rt_dst;
+	iph->saddr		=	rt->rt_src;
+
+	if ((iph->ttl = tiph->ttl) == 0)
+		iph->ttl	=	old_iph->ttl;
+
+	nf_reset(skb);
+
+	IPTUNNEL_XMIT();
+	tunnel->recursion--;
+	return 0;
+
+tx_error_icmp:
+	dst_link_failure(skb);
+tx_error:
+	stats->tx_errors++;
+	dev_kfree_skb(skb);
+	tunnel->recursion--;
+	return 0;
+}
+
+static int
+ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	int err = 0;
+	struct ip_tunnel_parm p;
+	struct ip_tunnel *t;
+
+	switch (cmd) {
+	case SIOCGETTUNNEL:
+		t = NULL;
+		if (dev == ipip_fb_tunnel_dev) {
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
+				err = -EFAULT;
+				break;
+			}
+			t = ipip_tunnel_locate(&p, 0);
+		}
+		if (t == NULL)
+			t = (struct ip_tunnel*)dev->priv;
+		memcpy(&p, &t->parms, sizeof(p));
+		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
+			err = -EFAULT;
+		break;
+
+	case SIOCADDTUNNEL:
+	case SIOCCHGTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		err = -EFAULT;
+		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+			goto done;
+
+		err = -EINVAL;
+		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP ||
+		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
+			goto done;
+		if (p.iph.ttl)
+			p.iph.frag_off |= htons(IP_DF);
+
+		t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL);
+
+		if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
+			if (t != NULL) {
+				if (t->dev != dev) {
+					err = -EEXIST;
+					break;
+				}
+			} else {
+				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
+				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
+					err = -EINVAL;
+					break;
+				}
+				t = (struct ip_tunnel*)dev->priv;
+				ipip_tunnel_unlink(t);
+				t->parms.iph.saddr = p.iph.saddr;
+				t->parms.iph.daddr = p.iph.daddr;
+				memcpy(dev->dev_addr, &p.iph.saddr, 4);
+				memcpy(dev->broadcast, &p.iph.daddr, 4);
+				ipip_tunnel_link(t);
+				netdev_state_change(dev);
+			}
+		}
+
+		if (t) {
+			err = 0;
+			if (cmd == SIOCCHGTUNNEL) {
+				t->parms.iph.ttl = p.iph.ttl;
+				t->parms.iph.tos = p.iph.tos;
+				t->parms.iph.frag_off = p.iph.frag_off;
+			}
+			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
+				err = -EFAULT;
+		} else
+			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
+		break;
+
+	case SIOCDELTUNNEL:
+		err = -EPERM;
+		if (!capable(CAP_NET_ADMIN))
+			goto done;
+
+		if (dev == ipip_fb_tunnel_dev) {
+			err = -EFAULT;
+			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
+				goto done;
+			err = -ENOENT;
+			if ((t = ipip_tunnel_locate(&p, 0)) == NULL)
+				goto done;
+			err = -EPERM;
+			if (t->dev == ipip_fb_tunnel_dev)
+				goto done;
+			dev = t->dev;
+		}
+		err = unregister_netdevice(dev);
+		break;
+
+	default:
+		err = -EINVAL;
+	}
+
+done:
+	return err;
+}
+
+static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev)
+{
+	return &(((struct ip_tunnel*)dev->priv)->stat);
+}
+
+static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+	if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr))
+		return -EINVAL;
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+static void ipip_tunnel_setup(struct net_device *dev)
+{
+	SET_MODULE_OWNER(dev);
+	dev->uninit		= ipip_tunnel_uninit;
+	dev->hard_start_xmit	= ipip_tunnel_xmit;
+	dev->get_stats		= ipip_tunnel_get_stats;
+	dev->do_ioctl		= ipip_tunnel_ioctl;
+	dev->change_mtu		= ipip_tunnel_change_mtu;
+	dev->destructor		= free_netdev;
+
+	dev->type		= ARPHRD_TUNNEL;
+	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
+	dev->mtu		= 1500 - sizeof(struct iphdr);
+	dev->flags		= IFF_NOARP;
+	dev->iflink		= 0;
+	dev->addr_len		= 4;
+}
+
+static int ipip_tunnel_init(struct net_device *dev)
+{
+	struct net_device *tdev = NULL;
+	struct ip_tunnel *tunnel;
+	struct iphdr *iph;
+
+	tunnel = (struct ip_tunnel*)dev->priv;
+	iph = &tunnel->parms.iph;
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
+	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
+
+	if (iph->daddr) {
+		struct flowi fl = { .oif = tunnel->parms.link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = iph->daddr,
+						.saddr = iph->saddr,
+						.tos = RT_TOS(iph->tos) } },
+				    .proto = IPPROTO_IPIP };
+		struct rtable *rt;
+		if (!ip_route_output_key(&rt, &fl)) {
+			tdev = rt->u.dst.dev;
+			ip_rt_put(rt);
+		}
+		dev->flags |= IFF_POINTOPOINT;
+	}
+
+	if (!tdev && tunnel->parms.link)
+		tdev = __dev_get_by_index(tunnel->parms.link);
+
+	if (tdev) {
+		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
+		dev->mtu = tdev->mtu - sizeof(struct iphdr);
+	}
+	dev->iflink = tunnel->parms.link;
+
+	return 0;
+}
+
+static int __init ipip_fb_tunnel_init(struct net_device *dev)
+{
+	struct ip_tunnel *tunnel = dev->priv;
+	struct iphdr *iph = &tunnel->parms.iph;
+
+	tunnel->dev = dev;
+	strcpy(tunnel->parms.name, dev->name);
+
+	iph->version		= 4;
+	iph->protocol		= IPPROTO_IPIP;
+	iph->ihl		= 5;
+
+	dev_hold(dev);
+	tunnels_wc[0]		= tunnel;
+	return 0;
+}
+
+static struct xfrm_tunnel ipip_handler = {
+	.handler	=	ipip_rcv,
+	.err_handler	=	ipip_err,
+};
+
+static char banner[] __initdata =
+	KERN_INFO "IPv4 over IPv4 tunneling driver\n";
+
+static int __init ipip_init(void)
+{
+	int err;
+
+	printk(banner);
+
+	if (xfrm4_tunnel_register(&ipip_handler) < 0) {
+		printk(KERN_INFO "ipip init: can't register tunnel\n");
+		return -EAGAIN;
+	}
+
+	ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel),
+					   "tunl0",
+					   ipip_tunnel_setup);
+	if (!ipip_fb_tunnel_dev) {
+		err = -ENOMEM;
+		goto err1;
+	}
+
+	ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init;
+
+	if ((err = register_netdev(ipip_fb_tunnel_dev)))
+		goto err2;
+ out:
+	return err;
+ err2:
+	free_netdev(ipip_fb_tunnel_dev);
+ err1:
+	xfrm4_tunnel_deregister(&ipip_handler);
+	goto out;
+}
+
+static void __exit ipip_fini(void)
+{
+	if (xfrm4_tunnel_deregister(&ipip_handler) < 0)
+		printk(KERN_INFO "ipip close: can't deregister tunnel\n");
+
+	unregister_netdev(ipip_fb_tunnel_dev);
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
new file mode 100644
index 000000000000..e21c049ec62a
--- /dev/null
+++ b/net/ipv4/ipmr.c
@@ -0,0 +1,1900 @@
+/*
+ *	IP multicast routing support for mrouted 3.6/3.8
+ *
+ *		(c) 1995 Alan Cox, <alan@redhat.com>
+ *	  Linux Consultancy and Custom Driver Development
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
+ *
+ *	Fixes:
+ *	Michael Chastain	:	Incorrect size of copying.
+ *	Alan Cox		:	Added the cache manager code
+ *	Alan Cox		:	Fixed the clone/copy bug and device race.
+ *	Mike McLagan		:	Routing by source
+ *	Malcolm Beattie		:	Buffer handling fixes.
+ *	Alexey Kuznetsov	:	Double buffer free and other fixes.
+ *	SVR Anand		:	Fixed several multicast bugs and problems.
+ *	Alexey Kuznetsov	:	Status, optimisations and more.
+ *	Brad Parker		:	Better behaviour on mrouted upcall
+ *					overflow.
+ *      Carlos Picoto           :       PIMv1 Support
+ *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
+ *					Relax this requrement to work with older peers.
+ *
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
+#define CONFIG_IP_PIMSM	1
+#endif
+
+static struct sock *mroute_socket;
+
+
+/* Big lock, protecting vif table, mrt cache and mroute socket state.
+   Note that the changes are semaphored via rtnl_lock.
+ */
+
+static DEFINE_RWLOCK(mrt_lock);
+
+/*
+ *	Multicast router control variables
+ */
+
+static struct vif_device vif_table[MAXVIFS];		/* Devices 		*/
+static int maxvif;
+
+#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
+
+static int mroute_do_assert;				/* Set in PIM assert	*/
+static int mroute_do_pim;
+
+static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/
+
+static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
+static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/
+
+/* Special spinlock for queue of unresolved entries */
+static DEFINE_SPINLOCK(mfc_unres_lock);
+
+/* We return to original Alan's scheme. Hash table of resolved
+   entries is changed only in process context and protected
+   with weak lock mrt_lock. Queue of unresolved entries is protected
+   with strong spinlock mfc_unres_lock.
+
+   In this case data path is free of exclusive locks at all.
+ */
+
+static kmem_cache_t *mrt_cachep;
+
+static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
+static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
+static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
+
+#ifdef CONFIG_IP_PIMSM_V2
+static struct net_protocol pim_protocol;
+#endif
+
+static struct timer_list ipmr_expire_timer;
+
+/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
+
+static
+struct net_device *ipmr_new_tunnel(struct vifctl *v)
+{
+	struct net_device  *dev;
+
+	dev = __dev_get_by_name("tunl0");
+
+	if (dev) {
+		int err;
+		struct ifreq ifr;
+		mm_segment_t	oldfs;
+		struct ip_tunnel_parm p;
+		struct in_device  *in_dev;
+
+		memset(&p, 0, sizeof(p));
+		p.iph.daddr = v->vifc_rmt_addr.s_addr;
+		p.iph.saddr = v->vifc_lcl_addr.s_addr;
+		p.iph.version = 4;
+		p.iph.ihl = 5;
+		p.iph.protocol = IPPROTO_IPIP;
+		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
+		ifr.ifr_ifru.ifru_data = (void*)&p;
+
+		oldfs = get_fs(); set_fs(KERNEL_DS);
+		err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
+		set_fs(oldfs);
+
+		dev = NULL;
+
+		if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
+			dev->flags |= IFF_MULTICAST;
+
+			in_dev = __in_dev_get(dev);
+			if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
+				goto failure;
+			in_dev->cnf.rp_filter = 0;
+
+			if (dev_open(dev))
+				goto failure;
+		}
+	}
+	return dev;
+
+failure:
+	/* allow the register to be completed before unregistering. */
+	rtnl_unlock();
+	rtnl_lock();
+
+	unregister_netdevice(dev);
+	return NULL;
+}
+
+#ifdef CONFIG_IP_PIMSM
+
+static int reg_vif_num = -1;
+
+static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	read_lock(&mrt_lock);
+	((struct net_device_stats*)dev->priv)->tx_bytes += skb->len;
+	((struct net_device_stats*)dev->priv)->tx_packets++;
+	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
+	read_unlock(&mrt_lock);
+	kfree_skb(skb);
+	return 0;
+}
+
+static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
+{
+	return (struct net_device_stats*)dev->priv;
+}
+
+static void reg_vif_setup(struct net_device *dev)
+{
+	dev->type		= ARPHRD_PIMREG;
+	dev->mtu		= 1500 - sizeof(struct iphdr) - 8;
+	dev->flags		= IFF_NOARP;
+	dev->hard_start_xmit	= reg_vif_xmit;
+	dev->get_stats		= reg_vif_get_stats;
+	dev->destructor		= free_netdev;
+}
+
+static struct net_device *ipmr_reg_vif(void)
+{
+	struct net_device *dev;
+	struct in_device *in_dev;
+
+	dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
+			   reg_vif_setup);
+
+	if (dev == NULL)
+		return NULL;
+
+	if (register_netdevice(dev)) {
+		free_netdev(dev);
+		return NULL;
+	}
+	dev->iflink = 0;
+
+	if ((in_dev = inetdev_init(dev)) == NULL)
+		goto failure;
+
+	in_dev->cnf.rp_filter = 0;
+
+	if (dev_open(dev))
+		goto failure;
+
+	return dev;
+
+failure:
+	/* allow the register to be completed before unregistering. */
+	rtnl_unlock();
+	rtnl_lock();
+
+	unregister_netdevice(dev);
+	return NULL;
+}
+#endif
+
+/*
+ *	Delete a VIF entry
+ */
+ 
+static int vif_delete(int vifi)
+{
+	struct vif_device *v;
+	struct net_device *dev;
+	struct in_device *in_dev;
+
+	if (vifi < 0 || vifi >= maxvif)
+		return -EADDRNOTAVAIL;
+
+	v = &vif_table[vifi];
+
+	write_lock_bh(&mrt_lock);
+	dev = v->dev;
+	v->dev = NULL;
+
+	if (!dev) {
+		write_unlock_bh(&mrt_lock);
+		return -EADDRNOTAVAIL;
+	}
+
+#ifdef CONFIG_IP_PIMSM
+	if (vifi == reg_vif_num)
+		reg_vif_num = -1;
+#endif
+
+	if (vifi+1 == maxvif) {
+		int tmp;
+		for (tmp=vifi-1; tmp>=0; tmp--) {
+			if (VIF_EXISTS(tmp))
+				break;
+		}
+		maxvif = tmp+1;
+	}
+
+	write_unlock_bh(&mrt_lock);
+
+	dev_set_allmulti(dev, -1);
+
+	if ((in_dev = __in_dev_get(dev)) != NULL) {
+		in_dev->cnf.mc_forwarding--;
+		ip_rt_multicast_event(in_dev);
+	}
+
+	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+		unregister_netdevice(dev);
+
+	dev_put(dev);
+	return 0;
+}
+
+/* Destroy an unresolved cache entry, killing queued skbs
+   and reporting error to netlink readers.
+ */
+
+static void ipmr_destroy_unres(struct mfc_cache *c)
+{
+	struct sk_buff *skb;
+
+	atomic_dec(&cache_resolve_queue_len);
+
+	while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
+		if (skb->nh.iph->version == 0) {
+			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+			nlh->nlmsg_type = NLMSG_ERROR;
+			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+			skb_trim(skb, nlh->nlmsg_len);
+			((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT;
+			netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+		} else
+			kfree_skb(skb);
+	}
+
+	kmem_cache_free(mrt_cachep, c);
+}
+
+
+/* Single timer process for all the unresolved queue. */
+
+static void ipmr_expire_process(unsigned long dummy)
+{
+	unsigned long now;
+	unsigned long expires;
+	struct mfc_cache *c, **cp;
+
+	if (!spin_trylock(&mfc_unres_lock)) {
+		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
+		return;
+	}
+
+	if (atomic_read(&cache_resolve_queue_len) == 0)
+		goto out;
+
+	now = jiffies;
+	expires = 10*HZ;
+	cp = &mfc_unres_queue;
+
+	while ((c=*cp) != NULL) {
+		if (time_after(c->mfc_un.unres.expires, now)) {
+			unsigned long interval = c->mfc_un.unres.expires - now;
+			if (interval < expires)
+				expires = interval;
+			cp = &c->next;
+			continue;
+		}
+
+		*cp = c->next;
+
+		ipmr_destroy_unres(c);
+	}
+
+	if (atomic_read(&cache_resolve_queue_len))
+		mod_timer(&ipmr_expire_timer, jiffies + expires);
+
+out:
+	spin_unlock(&mfc_unres_lock);
+}
+
+/* Fill oifs list. It is called under write locked mrt_lock. */
+
+static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls)
+{
+	int vifi;
+
+	cache->mfc_un.res.minvif = MAXVIFS;
+	cache->mfc_un.res.maxvif = 0;
+	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
+
+	for (vifi=0; vifi<maxvif; vifi++) {
+		if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
+			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
+			if (cache->mfc_un.res.minvif > vifi)
+				cache->mfc_un.res.minvif = vifi;
+			if (cache->mfc_un.res.maxvif <= vifi)
+				cache->mfc_un.res.maxvif = vifi + 1;
+		}
+	}
+}
+
+static int vif_add(struct vifctl *vifc, int mrtsock)
+{
+	int vifi = vifc->vifc_vifi;
+	struct vif_device *v = &vif_table[vifi];
+	struct net_device *dev;
+	struct in_device *in_dev;
+
+	/* Is vif busy ? */
+	if (VIF_EXISTS(vifi))
+		return -EADDRINUSE;
+
+	switch (vifc->vifc_flags) {
+#ifdef CONFIG_IP_PIMSM
+	case VIFF_REGISTER:
+		/*
+		 * Special Purpose VIF in PIM
+		 * All the packets will be sent to the daemon
+		 */
+		if (reg_vif_num >= 0)
+			return -EADDRINUSE;
+		dev = ipmr_reg_vif();
+		if (!dev)
+			return -ENOBUFS;
+		break;
+#endif
+	case VIFF_TUNNEL:	
+		dev = ipmr_new_tunnel(vifc);
+		if (!dev)
+			return -ENOBUFS;
+		break;
+	case 0:
+		dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr);
+		if (!dev)
+			return -EADDRNOTAVAIL;
+		__dev_put(dev);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if ((in_dev = __in_dev_get(dev)) == NULL)
+		return -EADDRNOTAVAIL;
+	in_dev->cnf.mc_forwarding++;
+	dev_set_allmulti(dev, +1);
+	ip_rt_multicast_event(in_dev);
+
+	/*
+	 *	Fill in the VIF structures
+	 */
+	v->rate_limit=vifc->vifc_rate_limit;
+	v->local=vifc->vifc_lcl_addr.s_addr;
+	v->remote=vifc->vifc_rmt_addr.s_addr;
+	v->flags=vifc->vifc_flags;
+	if (!mrtsock)
+		v->flags |= VIFF_STATIC;
+	v->threshold=vifc->vifc_threshold;
+	v->bytes_in = 0;
+	v->bytes_out = 0;
+	v->pkt_in = 0;
+	v->pkt_out = 0;
+	v->link = dev->ifindex;
+	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+		v->link = dev->iflink;
+
+	/* And finish update writing critical data */
+	write_lock_bh(&mrt_lock);
+	dev_hold(dev);
+	v->dev=dev;
+#ifdef CONFIG_IP_PIMSM
+	if (v->flags&VIFF_REGISTER)
+		reg_vif_num = vifi;
+#endif
+	if (vifi+1 > maxvif)
+		maxvif = vifi+1;
+	write_unlock_bh(&mrt_lock);
+	return 0;
+}
+
+static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp)
+{
+	int line=MFC_HASH(mcastgrp,origin);
+	struct mfc_cache *c;
+
+	for (c=mfc_cache_array[line]; c; c = c->next) {
+		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
+			break;
+	}
+	return c;
+}
+
+/*
+ *	Allocate a multicast cache entry
+ */
+static struct mfc_cache *ipmr_cache_alloc(void)
+{
+	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL);
+	if(c==NULL)
+		return NULL;
+	memset(c, 0, sizeof(*c));
+	c->mfc_un.res.minvif = MAXVIFS;
+	return c;
+}
+
+static struct mfc_cache *ipmr_cache_alloc_unres(void)
+{
+	struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC);
+	if(c==NULL)
+		return NULL;
+	memset(c, 0, sizeof(*c));
+	skb_queue_head_init(&c->mfc_un.unres.unresolved);
+	c->mfc_un.unres.expires = jiffies + 10*HZ;
+	return c;
+}
+
+/*
+ *	A cache entry has gone into a resolved state from queued
+ */
+ 
+static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
+{
+	struct sk_buff *skb;
+
+	/*
+	 *	Play the pending entries through our router
+	 */
+
+	while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
+		if (skb->nh.iph->version == 0) {
+			int err;
+			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
+
+			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
+				nlh->nlmsg_len = skb->tail - (u8*)nlh;
+			} else {
+				nlh->nlmsg_type = NLMSG_ERROR;
+				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
+				skb_trim(skb, nlh->nlmsg_len);
+				((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE;
+			}
+			err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT);
+		} else
+			ip_mr_forward(skb, c, 0);
+	}
+}
+
+/*
+ *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
+ *	expects the following bizarre scheme.
+ *
+ *	Called under mrt_lock.
+ */
+ 
+static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
+{
+	struct sk_buff *skb;
+	int ihl = pkt->nh.iph->ihl<<2;
+	struct igmphdr *igmp;
+	struct igmpmsg *msg;
+	int ret;
+
+#ifdef CONFIG_IP_PIMSM
+	if (assert == IGMPMSG_WHOLEPKT)
+		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
+	else
+#endif
+		skb = alloc_skb(128, GFP_ATOMIC);
+
+	if(!skb)
+		return -ENOBUFS;
+
+#ifdef CONFIG_IP_PIMSM
+	if (assert == IGMPMSG_WHOLEPKT) {
+		/* Ugly, but we have no choice with this interface.
+		   Duplicate old header, fix ihl, length etc.
+		   And all this only to mangle msg->im_msgtype and
+		   to set msg->im_mbz to "mbz" :-)
+		 */
+		msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr));
+		skb->nh.raw = skb->h.raw = (u8*)msg;
+		memcpy(msg, pkt->nh.raw, sizeof(struct iphdr));
+		msg->im_msgtype = IGMPMSG_WHOLEPKT;
+		msg->im_mbz = 0;
+ 		msg->im_vif = reg_vif_num;
+		skb->nh.iph->ihl = sizeof(struct iphdr) >> 2;
+		skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr));
+	} else 
+#endif
+	{	
+		
+	/*
+	 *	Copy the IP header
+	 */
+
+	skb->nh.iph = (struct iphdr *)skb_put(skb, ihl);
+	memcpy(skb->data,pkt->data,ihl);
+	skb->nh.iph->protocol = 0;			/* Flag to the kernel this is a route add */
+	msg = (struct igmpmsg*)skb->nh.iph;
+	msg->im_vif = vifi;
+	skb->dst = dst_clone(pkt->dst);
+
+	/*
+	 *	Add our header
+	 */
+
+	igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
+	igmp->type	=
+	msg->im_msgtype = assert;
+	igmp->code 	=	0;
+	skb->nh.iph->tot_len=htons(skb->len);			/* Fix the length */
+	skb->h.raw = skb->nh.raw;
+        }
+
+	if (mroute_socket == NULL) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	/*
+	 *	Deliver to mrouted
+	 */
+	if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
+		kfree_skb(skb);
+	}
+
+	return ret;
+}
+
+/*
+ *	Queue a packet for resolution. It gets locked cache entry!
+ */
+ 
+static int
+ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
+{
+	int err;
+	struct mfc_cache *c;
+
+	spin_lock_bh(&mfc_unres_lock);
+	for (c=mfc_unres_queue; c; c=c->next) {
+		if (c->mfc_mcastgrp == skb->nh.iph->daddr &&
+		    c->mfc_origin == skb->nh.iph->saddr)
+			break;
+	}
+
+	if (c == NULL) {
+		/*
+		 *	Create a new entry if allowable
+		 */
+
+		if (atomic_read(&cache_resolve_queue_len)>=10 ||
+		    (c=ipmr_cache_alloc_unres())==NULL) {
+			spin_unlock_bh(&mfc_unres_lock);
+
+			kfree_skb(skb);
+			return -ENOBUFS;
+		}
+
+		/*
+		 *	Fill in the new cache entry
+		 */
+		c->mfc_parent=-1;
+		c->mfc_origin=skb->nh.iph->saddr;
+		c->mfc_mcastgrp=skb->nh.iph->daddr;
+
+		/*
+		 *	Reflect first query at mrouted.
+		 */
+		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
+			/* If the report failed throw the cache entry 
+			   out - Brad Parker
+			 */
+			spin_unlock_bh(&mfc_unres_lock);
+
+			kmem_cache_free(mrt_cachep, c);
+			kfree_skb(skb);
+			return err;
+		}
+
+		atomic_inc(&cache_resolve_queue_len);
+		c->next = mfc_unres_queue;
+		mfc_unres_queue = c;
+
+		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
+	}
+
+	/*
+	 *	See if we can append the packet
+	 */
+	if (c->mfc_un.unres.unresolved.qlen>3) {
+		kfree_skb(skb);
+		err = -ENOBUFS;
+	} else {
+		skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
+		err = 0;
+	}
+
+	spin_unlock_bh(&mfc_unres_lock);
+	return err;
+}
+
+/*
+ *	MFC cache manipulation by user space mroute daemon
+ */
+
+static int ipmr_mfc_delete(struct mfcctl *mfc)
+{
+	int line;
+	struct mfc_cache *c, **cp;
+
+	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
+		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
+			write_lock_bh(&mrt_lock);
+			*cp = c->next;
+			write_unlock_bh(&mrt_lock);
+
+			kmem_cache_free(mrt_cachep, c);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
+{
+	int line;
+	struct mfc_cache *uc, *c, **cp;
+
+	line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
+
+	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
+		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
+		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
+			break;
+	}
+
+	if (c != NULL) {
+		write_lock_bh(&mrt_lock);
+		c->mfc_parent = mfc->mfcc_parent;
+		ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+		if (!mrtsock)
+			c->mfc_flags |= MFC_STATIC;
+		write_unlock_bh(&mrt_lock);
+		return 0;
+	}
+
+	if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
+		return -EINVAL;
+
+	c=ipmr_cache_alloc();
+	if (c==NULL)
+		return -ENOMEM;
+
+	c->mfc_origin=mfc->mfcc_origin.s_addr;
+	c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
+	c->mfc_parent=mfc->mfcc_parent;
+	ipmr_update_threshoulds(c, mfc->mfcc_ttls);
+	if (!mrtsock)
+		c->mfc_flags |= MFC_STATIC;
+
+	write_lock_bh(&mrt_lock);
+	c->next = mfc_cache_array[line];
+	mfc_cache_array[line] = c;
+	write_unlock_bh(&mrt_lock);
+
+	/*
+	 *	Check to see if we resolved a queued list. If so we
+	 *	need to send on the frames and tidy up.
+	 */
+	spin_lock_bh(&mfc_unres_lock);
+	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
+	     cp = &uc->next) {
+		if (uc->mfc_origin == c->mfc_origin &&
+		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
+			*cp = uc->next;
+			if (atomic_dec_and_test(&cache_resolve_queue_len))
+				del_timer(&ipmr_expire_timer);
+			break;
+		}
+	}
+	spin_unlock_bh(&mfc_unres_lock);
+
+	if (uc) {
+		ipmr_cache_resolve(uc, c);
+		kmem_cache_free(mrt_cachep, uc);
+	}
+	return 0;
+}
+
+/*
+ *	Close the multicast socket, and clear the vif tables etc
+ */
+ 
+static void mroute_clean_tables(struct sock *sk)
+{
+	int i;
+		
+	/*
+	 *	Shut down all active vif entries
+	 */
+	for(i=0; i<maxvif; i++) {
+		if (!(vif_table[i].flags&VIFF_STATIC))
+			vif_delete(i);
+	}
+
+	/*
+	 *	Wipe the cache
+	 */
+	for (i=0;i<MFC_LINES;i++) {
+		struct mfc_cache *c, **cp;
+
+		cp = &mfc_cache_array[i];
+		while ((c = *cp) != NULL) {
+			if (c->mfc_flags&MFC_STATIC) {
+				cp = &c->next;
+				continue;
+			}
+			write_lock_bh(&mrt_lock);
+			*cp = c->next;
+			write_unlock_bh(&mrt_lock);
+
+			kmem_cache_free(mrt_cachep, c);
+		}
+	}
+
+	if (atomic_read(&cache_resolve_queue_len) != 0) {
+		struct mfc_cache *c;
+
+		spin_lock_bh(&mfc_unres_lock);
+		while (mfc_unres_queue != NULL) {
+			c = mfc_unres_queue;
+			mfc_unres_queue = c->next;
+			spin_unlock_bh(&mfc_unres_lock);
+
+			ipmr_destroy_unres(c);
+
+			spin_lock_bh(&mfc_unres_lock);
+		}
+		spin_unlock_bh(&mfc_unres_lock);
+	}
+}
+
+static void mrtsock_destruct(struct sock *sk)
+{
+	rtnl_lock();
+	if (sk == mroute_socket) {
+		ipv4_devconf.mc_forwarding--;
+
+		write_lock_bh(&mrt_lock);
+		mroute_socket=NULL;
+		write_unlock_bh(&mrt_lock);
+
+		mroute_clean_tables(sk);
+	}
+	rtnl_unlock();
+}
+
+/*
+ *	Socket options and virtual interface manipulation. The whole
+ *	virtual interface system is a complete heap, but unfortunately
+ *	that's how BSD mrouted happens to think. Maybe one day with a proper
+ *	MOSPF/PIM router set up we can clean this up.
+ */
+ 
+int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
+{
+	int ret;
+	struct vifctl vif;
+	struct mfcctl mfc;
+	
+	if(optname!=MRT_INIT)
+	{
+		if(sk!=mroute_socket && !capable(CAP_NET_ADMIN))
+			return -EACCES;
+	}
+
+	switch(optname)
+	{
+		case MRT_INIT:
+			if (sk->sk_type != SOCK_RAW ||
+			    inet_sk(sk)->num != IPPROTO_IGMP)
+				return -EOPNOTSUPP;
+			if(optlen!=sizeof(int))
+				return -ENOPROTOOPT;
+
+			rtnl_lock();
+			if (mroute_socket) {
+				rtnl_unlock();
+				return -EADDRINUSE;
+			}
+
+			ret = ip_ra_control(sk, 1, mrtsock_destruct);
+			if (ret == 0) {
+				write_lock_bh(&mrt_lock);
+				mroute_socket=sk;
+				write_unlock_bh(&mrt_lock);
+
+				ipv4_devconf.mc_forwarding++;
+			}
+			rtnl_unlock();
+			return ret;
+		case MRT_DONE:
+			if (sk!=mroute_socket)
+				return -EACCES;
+			return ip_ra_control(sk, 0, NULL);
+		case MRT_ADD_VIF:
+		case MRT_DEL_VIF:
+			if(optlen!=sizeof(vif))
+				return -EINVAL;
+			if (copy_from_user(&vif,optval,sizeof(vif)))
+				return -EFAULT; 
+			if(vif.vifc_vifi >= MAXVIFS)
+				return -ENFILE;
+			rtnl_lock();
+			if (optname==MRT_ADD_VIF) {
+				ret = vif_add(&vif, sk==mroute_socket);
+			} else {
+				ret = vif_delete(vif.vifc_vifi);
+			}
+			rtnl_unlock();
+			return ret;
+
+		/*
+		 *	Manipulate the forwarding caches. These live
+		 *	in a sort of kernel/user symbiosis.
+		 */
+		case MRT_ADD_MFC:
+		case MRT_DEL_MFC:
+			if(optlen!=sizeof(mfc))
+				return -EINVAL;
+			if (copy_from_user(&mfc,optval, sizeof(mfc)))
+				return -EFAULT;
+			rtnl_lock();
+			if (optname==MRT_DEL_MFC)
+				ret = ipmr_mfc_delete(&mfc);
+			else
+				ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
+			rtnl_unlock();
+			return ret;
+		/*
+		 *	Control PIM assert.
+		 */
+		case MRT_ASSERT:
+		{
+			int v;
+			if(get_user(v,(int __user *)optval))
+				return -EFAULT;
+			mroute_do_assert=(v)?1:0;
+			return 0;
+		}
+#ifdef CONFIG_IP_PIMSM
+		case MRT_PIM:
+		{
+			int v, ret;
+			if(get_user(v,(int __user *)optval))
+				return -EFAULT;
+			v = (v)?1:0;
+			rtnl_lock();
+			ret = 0;
+			if (v != mroute_do_pim) {
+				mroute_do_pim = v;
+				mroute_do_assert = v;
+#ifdef CONFIG_IP_PIMSM_V2
+				if (mroute_do_pim)
+					ret = inet_add_protocol(&pim_protocol,
+								IPPROTO_PIM);
+				else
+					ret = inet_del_protocol(&pim_protocol,
+								IPPROTO_PIM);
+				if (ret < 0)
+					ret = -EAGAIN;
+#endif
+			}
+			rtnl_unlock();
+			return ret;
+		}
+#endif
+		/*
+		 *	Spurious command, or MRT_VERSION which you cannot
+		 *	set.
+		 */
+		default:
+			return -ENOPROTOOPT;
+	}
+}
+
+/*
+ *	Getsock opt support for the multicast routing system.
+ */
+ 
+int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
+{
+	int olr;
+	int val;
+
+	if(optname!=MRT_VERSION && 
+#ifdef CONFIG_IP_PIMSM
+	   optname!=MRT_PIM &&
+#endif
+	   optname!=MRT_ASSERT)
+		return -ENOPROTOOPT;
+
+	if (get_user(olr, optlen))
+		return -EFAULT;
+
+	olr = min_t(unsigned int, olr, sizeof(int));
+	if (olr < 0)
+		return -EINVAL;
+		
+	if(put_user(olr,optlen))
+		return -EFAULT;
+	if(optname==MRT_VERSION)
+		val=0x0305;
+#ifdef CONFIG_IP_PIMSM
+	else if(optname==MRT_PIM)
+		val=mroute_do_pim;
+#endif
+	else
+		val=mroute_do_assert;
+	if(copy_to_user(optval,&val,olr))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ *	The IP multicast ioctl support routines.
+ */
+ 
+int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
+{
+	struct sioc_sg_req sr;
+	struct sioc_vif_req vr;
+	struct vif_device *vif;
+	struct mfc_cache *c;
+	
+	switch(cmd)
+	{
+		case SIOCGETVIFCNT:
+			if (copy_from_user(&vr,arg,sizeof(vr)))
+				return -EFAULT; 
+			if(vr.vifi>=maxvif)
+				return -EINVAL;
+			read_lock(&mrt_lock);
+			vif=&vif_table[vr.vifi];
+			if(VIF_EXISTS(vr.vifi))	{
+				vr.icount=vif->pkt_in;
+				vr.ocount=vif->pkt_out;
+				vr.ibytes=vif->bytes_in;
+				vr.obytes=vif->bytes_out;
+				read_unlock(&mrt_lock);
+
+				if (copy_to_user(arg,&vr,sizeof(vr)))
+					return -EFAULT;
+				return 0;
+			}
+			read_unlock(&mrt_lock);
+			return -EADDRNOTAVAIL;
+		case SIOCGETSGCNT:
+			if (copy_from_user(&sr,arg,sizeof(sr)))
+				return -EFAULT;
+
+			read_lock(&mrt_lock);
+			c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
+			if (c) {
+				sr.pktcnt = c->mfc_un.res.pkt;
+				sr.bytecnt = c->mfc_un.res.bytes;
+				sr.wrong_if = c->mfc_un.res.wrong_if;
+				read_unlock(&mrt_lock);
+
+				if (copy_to_user(arg,&sr,sizeof(sr)))
+					return -EFAULT;
+				return 0;
+			}
+			read_unlock(&mrt_lock);
+			return -EADDRNOTAVAIL;
+		default:
+			return -ENOIOCTLCMD;
+	}
+}
+
+
+static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct vif_device *v;
+	int ct;
+	if (event != NETDEV_UNREGISTER)
+		return NOTIFY_DONE;
+	v=&vif_table[0];
+	for(ct=0;ct<maxvif;ct++,v++) {
+		if (v->dev==ptr)
+			vif_delete(ct);
+	}
+	return NOTIFY_DONE;
+}
+
+
+static struct notifier_block ip_mr_notifier={
+	.notifier_call = ipmr_device_event,
+};
+
+/*
+ * 	Encapsulate a packet by attaching a valid IPIP header to it.
+ *	This avoids tunnel drivers and other mess and gives us the speed so
+ *	important for multicast video.
+ */
+ 
+static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr)
+{
+	struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr));
+
+	iph->version	= 	4;
+	iph->tos	=	skb->nh.iph->tos;
+	iph->ttl	=	skb->nh.iph->ttl;
+	iph->frag_off	=	0;
+	iph->daddr	=	daddr;
+	iph->saddr	=	saddr;
+	iph->protocol	=	IPPROTO_IPIP;
+	iph->ihl	=	5;
+	iph->tot_len	=	htons(skb->len);
+	ip_select_ident(iph, skb->dst, NULL);
+	ip_send_check(iph);
+
+	skb->h.ipiph = skb->nh.iph;
+	skb->nh.iph = iph;
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	nf_reset(skb);
+}
+
+static inline int ipmr_forward_finish(struct sk_buff *skb)
+{
+	struct ip_options * opt	= &(IPCB(skb)->opt);
+
+	IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
+
+	if (unlikely(opt->optlen))
+		ip_forward_options(skb);
+
+	return dst_output(skb);
+}
+
+/*
+ *	Processing handlers for ipmr_forward
+ */
+
+static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct vif_device *vif = &vif_table[vifi];
+	struct net_device *dev;
+	struct rtable *rt;
+	int    encap = 0;
+
+	if (vif->dev == NULL)
+		goto out_free;
+
+#ifdef CONFIG_IP_PIMSM
+	if (vif->flags & VIFF_REGISTER) {
+		vif->pkt_out++;
+		vif->bytes_out+=skb->len;
+		((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len;
+		((struct net_device_stats*)vif->dev->priv)->tx_packets++;
+		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
+		kfree_skb(skb);
+		return;
+	}
+#endif
+
+	if (vif->flags&VIFF_TUNNEL) {
+		struct flowi fl = { .oif = vif->link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = vif->remote,
+						.saddr = vif->local,
+						.tos = RT_TOS(iph->tos) } },
+				    .proto = IPPROTO_IPIP };
+		if (ip_route_output_key(&rt, &fl))
+			goto out_free;
+		encap = sizeof(struct iphdr);
+	} else {
+		struct flowi fl = { .oif = vif->link,
+				    .nl_u = { .ip4_u =
+					      { .daddr = iph->daddr,
+						.tos = RT_TOS(iph->tos) } },
+				    .proto = IPPROTO_IPIP };
+		if (ip_route_output_key(&rt, &fl))
+			goto out_free;
+	}
+
+	dev = rt->u.dst.dev;
+
+	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
+		/* Do not fragment multicasts. Alas, IPv4 does not
+		   allow to send ICMP, so that packets will disappear
+		   to blackhole.
+		 */
+
+		IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
+		ip_rt_put(rt);
+		goto out_free;
+	}
+
+	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
+
+	if (skb_cow(skb, encap)) {
+ 		ip_rt_put(rt);
+		goto out_free;
+	}
+
+	vif->pkt_out++;
+	vif->bytes_out+=skb->len;
+
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+	iph = skb->nh.iph;
+	ip_decrease_ttl(iph);
+
+	/* FIXME: forward and output firewalls used to be called here.
+	 * What do we do with netfilter? -- RR */
+	if (vif->flags & VIFF_TUNNEL) {
+		ip_encap(skb, vif->local, vif->remote);
+		/* FIXME: extra output firewall step used to be here. --RR */
+		((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++;
+		((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len;
+	}
+
+	IPCB(skb)->flags |= IPSKB_FORWARDED;
+
+	/*
+	 * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+	 * not only before forwarding, but after forwarding on all output
+	 * interfaces. It is clear, if mrouter runs a multicasting
+	 * program, it should receive packets not depending to what interface
+	 * program is joined.
+	 * If we will not make it, the program will have to join on all
+	 * interfaces. On the other hand, multihoming host (or router, but
+	 * not mrouter) cannot join to more than one interface - it will
+	 * result in receiving multiple packets.
+	 */
+	NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, 
+		ipmr_forward_finish);
+	return;
+
+out_free:
+	kfree_skb(skb);
+	return;
+}
+
+static int ipmr_find_vif(struct net_device *dev)
+{
+	int ct;
+	for (ct=maxvif-1; ct>=0; ct--) {
+		if (vif_table[ct].dev == dev)
+			break;
+	}
+	return ct;
+}
+
+/* "local" means that we should preserve one skb (for local delivery) */
+
+static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
+{
+	int psend = -1;
+	int vif, ct;
+
+	vif = cache->mfc_parent;
+	cache->mfc_un.res.pkt++;
+	cache->mfc_un.res.bytes += skb->len;
+
+	/*
+	 * Wrong interface: drop packet and (maybe) send PIM assert.
+	 */
+	if (vif_table[vif].dev != skb->dev) {
+		int true_vifi;
+
+		if (((struct rtable*)skb->dst)->fl.iif == 0) {
+			/* It is our own packet, looped back.
+			   Very complicated situation...
+
+			   The best workaround until routing daemons will be
+			   fixed is not to redistribute packet, if it was
+			   send through wrong interface. It means, that
+			   multicast applications WILL NOT work for
+			   (S,G), which have default multicast route pointing
+			   to wrong oif. In any case, it is not a good
+			   idea to use multicasting applications on router.
+			 */
+			goto dont_forward;
+		}
+
+		cache->mfc_un.res.wrong_if++;
+		true_vifi = ipmr_find_vif(skb->dev);
+
+		if (true_vifi >= 0 && mroute_do_assert &&
+		    /* pimsm uses asserts, when switching from RPT to SPT,
+		       so that we cannot check that packet arrived on an oif.
+		       It is bad, but otherwise we would need to move pretty
+		       large chunk of pimd to kernel. Ough... --ANK
+		     */
+		    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
+		    time_after(jiffies, 
+			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
+			cache->mfc_un.res.last_assert = jiffies;
+			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
+		}
+		goto dont_forward;
+	}
+
+	vif_table[vif].pkt_in++;
+	vif_table[vif].bytes_in+=skb->len;
+
+	/*
+	 *	Forward the frame
+	 */
+	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
+		if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) {
+			if (psend != -1) {
+				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+				if (skb2)
+					ipmr_queue_xmit(skb2, cache, psend);
+			}
+			psend=ct;
+		}
+	}
+	if (psend != -1) {
+		if (local) {
+			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+			if (skb2)
+				ipmr_queue_xmit(skb2, cache, psend);
+		} else {
+			ipmr_queue_xmit(skb, cache, psend);
+			return 0;
+		}
+	}
+
+dont_forward:
+	if (!local)
+		kfree_skb(skb);
+	return 0;
+}
+
+
+/*
+ *	Multicast packets for forwarding arrive here
+ */
+
+int ip_mr_input(struct sk_buff *skb)
+{
+	struct mfc_cache *cache;
+	int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
+
+	/* Packet is looped back after forward, it should not be
+	   forwarded second time, but still can be delivered locally.
+	 */
+	if (IPCB(skb)->flags&IPSKB_FORWARDED)
+		goto dont_forward;
+
+	if (!local) {
+		    if (IPCB(skb)->opt.router_alert) {
+			    if (ip_call_ra_chain(skb))
+				    return 0;
+		    } else if (skb->nh.iph->protocol == IPPROTO_IGMP){
+			    /* IGMPv1 (and broken IGMPv2 implementations sort of
+			       Cisco IOS <= 11.2(8)) do not put router alert
+			       option to IGMP packets destined to routable
+			       groups. It is very bad, because it means
+			       that we can forward NO IGMP messages.
+			     */
+			    read_lock(&mrt_lock);
+			    if (mroute_socket) {
+				    raw_rcv(mroute_socket, skb);
+				    read_unlock(&mrt_lock);
+				    return 0;
+			    }
+			    read_unlock(&mrt_lock);
+		    }
+	}
+
+	read_lock(&mrt_lock);
+	cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr);
+
+	/*
+	 *	No usable cache entry
+	 */
+	if (cache==NULL) {
+		int vif;
+
+		if (local) {
+			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+			ip_local_deliver(skb);
+			if (skb2 == NULL) {
+				read_unlock(&mrt_lock);
+				return -ENOBUFS;
+			}
+			skb = skb2;
+		}
+
+		vif = ipmr_find_vif(skb->dev);
+		if (vif >= 0) {
+			int err = ipmr_cache_unresolved(vif, skb);
+			read_unlock(&mrt_lock);
+
+			return err;
+		}
+		read_unlock(&mrt_lock);
+		kfree_skb(skb);
+		return -ENODEV;
+	}
+
+	ip_mr_forward(skb, cache, local);
+
+	read_unlock(&mrt_lock);
+
+	if (local)
+		return ip_local_deliver(skb);
+
+	return 0;
+
+dont_forward:
+	if (local)
+		return ip_local_deliver(skb);
+	kfree_skb(skb);
+	return 0;
+}
+
+#ifdef CONFIG_IP_PIMSM_V1
+/*
+ * Handle IGMP messages of PIMv1
+ */
+
+int pim_rcv_v1(struct sk_buff * skb)
+{
+	struct igmphdr *pim;
+	struct iphdr   *encap;
+	struct net_device  *reg_dev = NULL;
+
+	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
+		goto drop;
+
+	pim = (struct igmphdr*)skb->h.raw;
+
+        if (!mroute_do_pim ||
+	    skb->len < sizeof(*pim) + sizeof(*encap) ||
+	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) 
+		goto drop;
+
+	encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr));
+	/*
+	   Check that:
+	   a. packet is really destinted to a multicast group
+	   b. packet is not a NULL-REGISTER
+	   c. packet is not truncated
+	 */
+	if (!MULTICAST(encap->daddr) ||
+	    encap->tot_len == 0 ||
+	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
+		goto drop;
+
+	read_lock(&mrt_lock);
+	if (reg_vif_num >= 0)
+		reg_dev = vif_table[reg_vif_num].dev;
+	if (reg_dev)
+		dev_hold(reg_dev);
+	read_unlock(&mrt_lock);
+
+	if (reg_dev == NULL) 
+		goto drop;
+
+	skb->mac.raw = skb->nh.raw;
+	skb_pull(skb, (u8*)encap - skb->data);
+	skb->nh.iph = (struct iphdr *)skb->data;
+	skb->dev = reg_dev;
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	skb->protocol = htons(ETH_P_IP);
+	skb->ip_summed = 0;
+	skb->pkt_type = PACKET_HOST;
+	dst_release(skb->dst);
+	skb->dst = NULL;
+	((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
+	((struct net_device_stats*)reg_dev->priv)->rx_packets++;
+	nf_reset(skb);
+	netif_rx(skb);
+	dev_put(reg_dev);
+	return 0;
+ drop:
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_IP_PIMSM_V2
+static int pim_rcv(struct sk_buff * skb)
+{
+	struct pimreghdr *pim;
+	struct iphdr   *encap;
+	struct net_device  *reg_dev = NULL;
+
+	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) 
+		goto drop;
+
+	pim = (struct pimreghdr*)skb->h.raw;
+        if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
+	    (pim->flags&PIM_NULL_REGISTER) ||
+	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && 
+	     (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))) 
+		goto drop;
+
+	/* check if the inner packet is destined to mcast group */
+	encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr));
+	if (!MULTICAST(encap->daddr) ||
+	    encap->tot_len == 0 ||
+	    ntohs(encap->tot_len) + sizeof(*pim) > skb->len) 
+		goto drop;
+
+	read_lock(&mrt_lock);
+	if (reg_vif_num >= 0)
+		reg_dev = vif_table[reg_vif_num].dev;
+	if (reg_dev)
+		dev_hold(reg_dev);
+	read_unlock(&mrt_lock);
+
+	if (reg_dev == NULL) 
+		goto drop;
+
+	skb->mac.raw = skb->nh.raw;
+	skb_pull(skb, (u8*)encap - skb->data);
+	skb->nh.iph = (struct iphdr *)skb->data;
+	skb->dev = reg_dev;
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+	skb->protocol = htons(ETH_P_IP);
+	skb->ip_summed = 0;
+	skb->pkt_type = PACKET_HOST;
+	dst_release(skb->dst);
+	((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len;
+	((struct net_device_stats*)reg_dev->priv)->rx_packets++;
+	skb->dst = NULL;
+	nf_reset(skb);
+	netif_rx(skb);
+	dev_put(reg_dev);
+	return 0;
+ drop:
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
+static int
+ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
+{
+	int ct;
+	struct rtnexthop *nhp;
+	struct net_device *dev = vif_table[c->mfc_parent].dev;
+	u8 *b = skb->tail;
+	struct rtattr *mp_head;
+
+	if (dev)
+		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+
+	mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
+
+	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
+		if (c->mfc_un.res.ttls[ct] < 255) {
+			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
+				goto rtattr_failure;
+			nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
+			nhp->rtnh_flags = 0;
+			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
+			nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
+			nhp->rtnh_len = sizeof(*nhp);
+		}
+	}
+	mp_head->rta_type = RTA_MULTIPATH;
+	mp_head->rta_len = skb->tail - (u8*)mp_head;
+	rtm->rtm_type = RTN_MULTICAST;
+	return 1;
+
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -EMSGSIZE;
+}
+
+int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
+{
+	int err;
+	struct mfc_cache *cache;
+	struct rtable *rt = (struct rtable*)skb->dst;
+
+	read_lock(&mrt_lock);
+	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
+
+	if (cache==NULL) {
+		struct net_device *dev;
+		int vif;
+
+		if (nowait) {
+			read_unlock(&mrt_lock);
+			return -EAGAIN;
+		}
+
+		dev = skb->dev;
+		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
+			read_unlock(&mrt_lock);
+			return -ENODEV;
+		}
+		skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+		skb->nh.iph->ihl = sizeof(struct iphdr)>>2;
+		skb->nh.iph->saddr = rt->rt_src;
+		skb->nh.iph->daddr = rt->rt_dst;
+		skb->nh.iph->version = 0;
+		err = ipmr_cache_unresolved(vif, skb);
+		read_unlock(&mrt_lock);
+		return err;
+	}
+
+	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+		cache->mfc_flags |= MFC_NOTIFY;
+	err = ipmr_fill_mroute(skb, cache, rtm);
+	read_unlock(&mrt_lock);
+	return err;
+}
+
+#ifdef CONFIG_PROC_FS	
+/*
+ *	The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
+ */
+struct ipmr_vif_iter {
+	int ct;
+};
+
+static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
+					   loff_t pos)
+{
+	for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
+		if(!VIF_EXISTS(iter->ct))
+			continue;
+		if (pos-- == 0) 
+			return &vif_table[iter->ct];
+	}
+	return NULL;
+}
+
+static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock(&mrt_lock);
+	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1) 
+		: SEQ_START_TOKEN;
+}
+
+static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ipmr_vif_iter *iter = seq->private;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ipmr_vif_seq_idx(iter, 0);
+	
+	while (++iter->ct < maxvif) {
+		if(!VIF_EXISTS(iter->ct))
+			continue;
+		return &vif_table[iter->ct];
+	}
+	return NULL;
+}
+
+static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock(&mrt_lock);
+}
+
+static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, 
+			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
+	} else {
+		const struct vif_device *vif = v;
+		const char *name =  vif->dev ? vif->dev->name : "none";
+
+		seq_printf(seq,
+			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
+			   vif - vif_table,
+			   name, vif->bytes_in, vif->pkt_in, 
+			   vif->bytes_out, vif->pkt_out,
+			   vif->flags, vif->local, vif->remote);
+	}
+	return 0;
+}
+
+static struct seq_operations ipmr_vif_seq_ops = {
+	.start = ipmr_vif_seq_start,
+	.next  = ipmr_vif_seq_next,
+	.stop  = ipmr_vif_seq_stop,
+	.show  = ipmr_vif_seq_show,
+};
+
+static int ipmr_vif_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+       
+	if (!s)
+		goto out;
+
+	rc = seq_open(file, &ipmr_vif_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	s->ct = 0;
+	seq = file->private_data;
+	seq->private = s;
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+
+}
+
+static struct file_operations ipmr_vif_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ipmr_vif_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+
+struct ipmr_mfc_iter {
+	struct mfc_cache **cache;
+	int ct;
+};
+
+
+static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
+{
+	struct mfc_cache *mfc;
+
+	it->cache = mfc_cache_array;
+	read_lock(&mrt_lock);
+	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) 
+		for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) 
+			if (pos-- == 0) 
+				return mfc;
+	read_unlock(&mrt_lock);
+
+	it->cache = &mfc_unres_queue;
+	spin_lock_bh(&mfc_unres_lock);
+	for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) 
+		if (pos-- == 0)
+			return mfc;
+	spin_unlock_bh(&mfc_unres_lock);
+
+	it->cache = NULL;
+	return NULL;
+}
+
+
+static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct ipmr_mfc_iter *it = seq->private;
+	it->cache = NULL;
+	it->ct = 0;
+	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1) 
+		: SEQ_START_TOKEN;
+}
+
+static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct mfc_cache *mfc = v;
+	struct ipmr_mfc_iter *it = seq->private;
+
+	++*pos;
+
+	if (v == SEQ_START_TOKEN)
+		return ipmr_mfc_seq_idx(seq->private, 0);
+
+	if (mfc->next)
+		return mfc->next;
+	
+	if (it->cache == &mfc_unres_queue) 
+		goto end_of_list;
+
+	BUG_ON(it->cache != mfc_cache_array);
+
+	while (++it->ct < MFC_LINES) {
+		mfc = mfc_cache_array[it->ct];
+		if (mfc)
+			return mfc;
+	}
+
+	/* exhausted cache_array, show unresolved */
+	read_unlock(&mrt_lock);
+	it->cache = &mfc_unres_queue;
+	it->ct = 0;
+		
+	spin_lock_bh(&mfc_unres_lock);
+	mfc = mfc_unres_queue;
+	if (mfc) 
+		return mfc;
+
+ end_of_list:
+	spin_unlock_bh(&mfc_unres_lock);
+	it->cache = NULL;
+
+	return NULL;
+}
+
+static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
+{
+	struct ipmr_mfc_iter *it = seq->private;
+
+	if (it->cache == &mfc_unres_queue)
+		spin_unlock_bh(&mfc_unres_lock);
+	else if (it->cache == mfc_cache_array)
+		read_unlock(&mrt_lock);
+}
+
+static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
+{
+	int n;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(seq, 
+		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
+	} else {
+		const struct mfc_cache *mfc = v;
+		const struct ipmr_mfc_iter *it = seq->private;
+		
+		seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
+			   (unsigned long) mfc->mfc_mcastgrp,
+			   (unsigned long) mfc->mfc_origin,
+			   mfc->mfc_parent,
+			   mfc->mfc_un.res.pkt,
+			   mfc->mfc_un.res.bytes,
+			   mfc->mfc_un.res.wrong_if);
+
+		if (it->cache != &mfc_unres_queue) {
+			for(n = mfc->mfc_un.res.minvif; 
+			    n < mfc->mfc_un.res.maxvif; n++ ) {
+				if(VIF_EXISTS(n) 
+				   && mfc->mfc_un.res.ttls[n] < 255)
+				seq_printf(seq, 
+					   " %2d:%-3d", 
+					   n, mfc->mfc_un.res.ttls[n]);
+			}
+		}
+		seq_putc(seq, '\n');
+	}
+	return 0;
+}
+
+static struct seq_operations ipmr_mfc_seq_ops = {
+	.start = ipmr_mfc_seq_start,
+	.next  = ipmr_mfc_seq_next,
+	.stop  = ipmr_mfc_seq_stop,
+	.show  = ipmr_mfc_seq_show,
+};
+
+static int ipmr_mfc_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+       
+	if (!s)
+		goto out;
+
+	rc = seq_open(file, &ipmr_mfc_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq = file->private_data;
+	seq->private = s;
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+
+}
+
+static struct file_operations ipmr_mfc_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ipmr_mfc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+#endif	
+
+#ifdef CONFIG_IP_PIMSM_V2
+static struct net_protocol pim_protocol = {
+	.handler	=	pim_rcv,
+};
+#endif
+
+
+/*
+ *	Setup for IP multicast routing
+ */
+ 
+void __init ip_mr_init(void)
+{
+	mrt_cachep = kmem_cache_create("ip_mrt_cache",
+				       sizeof(struct mfc_cache),
+				       0, SLAB_HWCACHE_ALIGN,
+				       NULL, NULL);
+	if (!mrt_cachep)
+		panic("cannot allocate ip_mrt_cache");
+
+	init_timer(&ipmr_expire_timer);
+	ipmr_expire_timer.function=ipmr_expire_process;
+	register_netdevice_notifier(&ip_mr_notifier);
+#ifdef CONFIG_PROC_FS	
+	proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
+	proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
+#endif	
+}
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig
new file mode 100644
index 000000000000..63a82b4b64bb
--- /dev/null
+++ b/net/ipv4/ipvs/Kconfig
@@ -0,0 +1,244 @@
+#
+# IP Virtual Server configuration
+#
+menu	"IP: Virtual Server Configuration"
+	depends on INET && NETFILTER
+
+config	IP_VS
+	tristate "IP virtual server support (EXPERIMENTAL)"
+	depends on INET && NETFILTER
+	---help---
+	  IP Virtual Server support will let you build a high-performance
+	  virtual server based on cluster of two or more real servers. This
+	  option must be enabled for at least one of the clustered computers
+	  that will take care of intercepting incoming connections to a
+	  single IP address and scheduling them to real servers.
+
+	  Three request dispatching techniques are implemented, they are
+	  virtual server via NAT, virtual server via tunneling and virtual
+	  server via direct routing. The several scheduling algorithms can
+	  be used to choose which server the connection is directed to,
+	  thus load balancing can be achieved among the servers.  For more
+	  information and its administration program, please visit the
+	  following URL: <http://www.linuxvirtualserver.org/>.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_DEBUG
+	bool "IP virtual server debugging"
+	depends on IP_VS
+	---help---
+	  Say Y here if you want to get additional messages useful in
+	  debugging the IP virtual server code. You can change the debug
+	  level in /proc/sys/net/ipv4/vs/debug_level
+
+config	IP_VS_TAB_BITS
+	int "IPVS connection table size (the Nth power of 2)"
+	depends on IP_VS 
+	default "12" 
+	---help---
+	  The IPVS connection hash table uses the chaining scheme to handle
+	  hash collisions. Using a big IPVS connection hash table will greatly
+	  reduce conflicts when there are hundreds of thousands of connections
+	  in the hash table.
+
+	  Note the table size must be power of 2. The table size will be the
+	  value of 2 to the your input number power. The number to choose is
+	  from 8 to 20, the default number is 12, which means the table size
+	  is 4096. Don't input the number too small, otherwise you will lose
+	  performance on it. You can adapt the table size yourself, according
+	  to your virtual server application. It is good to set the table size
+	  not far less than the number of connections per second multiplying
+	  average lasting time of connection in the table.  For example, your
+	  virtual server gets 200 connections per second, the connection lasts
+	  for 200 seconds in average in the connection table, the table size
+	  should be not far less than 200x200, it is good to set the table
+	  size 32768 (2**15).
+
+	  Another note that each connection occupies 128 bytes effectively and
+	  each hash entry uses 8 bytes, so you can estimate how much memory is
+	  needed for your box.
+
+comment "IPVS transport protocol load balancing support"
+        depends on IP_VS
+
+config	IP_VS_PROTO_TCP
+	bool "TCP load balancing support"
+	depends on IP_VS
+	---help---
+	  This option enables support for load balancing TCP transport
+	  protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_UDP
+	bool "UDP load balancing support"
+	depends on IP_VS
+	---help---
+	  This option enables support for load balancing UDP transport
+	  protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_ESP
+	bool "ESP load balancing support"
+	depends on IP_VS
+	---help---
+	  This option enables support for load balancing ESP (Encapsultion
+	  Security Payload) transport protocol. Say Y if unsure.
+
+config	IP_VS_PROTO_AH
+	bool "AH load balancing support"
+	depends on IP_VS
+	---help---
+	  This option enables support for load balancing AH (Authentication
+	  Header) transport protocol. Say Y if unsure.
+
+comment "IPVS scheduler"
+        depends on IP_VS
+
+config	IP_VS_RR
+	tristate "round-robin scheduling"
+	depends on IP_VS
+	---help---
+	  The robin-robin scheduling algorithm simply directs network
+	  connections to different real servers in a round-robin manner.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+ 
+config	IP_VS_WRR
+        tristate "weighted round-robin scheduling" 
+	depends on IP_VS
+	---help---
+	  The weighted robin-robin scheduling algorithm directs network
+	  connections to different real servers based on server weights
+	  in a round-robin manner. Servers with higher weights receive
+	  new connections first than those with less weights, and servers
+	  with higher weights get more connections than those with less
+	  weights and servers with equal weights get equal connections.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_LC
+        tristate "least-connection scheduling"
+        depends on IP_VS
+	---help---
+	  The least-connection scheduling algorithm directs network
+	  connections to the server with the least number of active 
+	  connections.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_WLC
+        tristate "weighted least-connection scheduling"
+        depends on IP_VS
+	---help---
+	  The weighted least-connection scheduling algorithm directs network
+	  connections to the server with the least active connections
+	  normalized by the server weight.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_LBLC
+	tristate "locality-based least-connection scheduling"
+        depends on IP_VS
+	---help---
+	  The locality-based least-connection scheduling algorithm is for
+	  destination IP load balancing. It is usually used in cache cluster.
+	  This algorithm usually directs packet destined for an IP address to
+	  its server if the server is alive and under load. If the server is
+	  overloaded (its active connection numbers is larger than its weight)
+	  and there is a server in its half load, then allocate the weighted
+	  least-connection server to this IP address.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config  IP_VS_LBLCR
+	tristate "locality-based least-connection with replication scheduling"
+        depends on IP_VS
+	---help---
+	  The locality-based least-connection with replication scheduling
+	  algorithm is also for destination IP load balancing. It is 
+	  usually used in cache cluster. It differs from the LBLC scheduling
+	  as follows: the load balancer maintains mappings from a target
+	  to a set of server nodes that can serve the target. Requests for
+	  a target are assigned to the least-connection node in the target's
+	  server set. If all the node in the server set are over loaded,
+	  it picks up a least-connection node in the cluster and adds it
+	  in the sever set for the target. If the server set has not been
+	  modified for the specified time, the most loaded node is removed
+	  from the server set, in order to avoid high degree of replication.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_DH
+	tristate "destination hashing scheduling"
+        depends on IP_VS
+	---help---
+	  The destination hashing scheduling algorithm assigns network
+	  connections to the servers through looking up a statically assigned
+	  hash table by their destination IP addresses.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_SH
+	tristate "source hashing scheduling"
+        depends on IP_VS
+	---help---
+	  The source hashing scheduling algorithm assigns network
+	  connections to the servers through looking up a statically assigned
+	  hash table by their source IP addresses.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_SED
+	tristate "shortest expected delay scheduling"
+        depends on IP_VS
+	---help---
+	  The shortest expected delay scheduling algorithm assigns network
+	  connections to the server with the shortest expected delay. The 
+	  expected delay that the job will experience is (Ci + 1) / Ui if 
+	  sent to the ith server, in which Ci is the number of connections
+	  on the the ith server and Ui is the fixed service rate (weight)
+	  of the ith server.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+config	IP_VS_NQ
+	tristate "never queue scheduling"
+        depends on IP_VS
+	---help---
+	  The never queue scheduling algorithm adopts a two-speed model.
+	  When there is an idle server available, the job will be sent to
+	  the idle server, instead of waiting for a fast one. When there
+	  is no idle server available, the job will be sent to the server
+	  that minimize its expected delay (The Shortest Expected Delay
+	  scheduling algorithm).
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+comment 'IPVS application helper'
+	depends on IP_VS
+
+config	IP_VS_FTP
+  	tristate "FTP protocol helper"
+        depends on IP_VS && IP_VS_PROTO_TCP
+	---help---
+	  FTP is a protocol that transfers IP address and/or port number in
+	  the payload. In the virtual server via Network Address Translation,
+	  the IP address and port number of real servers cannot be sent to
+	  clients in ftp connections directly, so FTP protocol helper is
+	  required for tracking the connection and mangling it back to that of
+	  virtual service.
+
+	  If you want to compile it in kernel, say Y. To compile it as a
+	  module, choose M here. If unsure, say N.
+
+endmenu
diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile
new file mode 100644
index 000000000000..a788461a40c9
--- /dev/null
+++ b/net/ipv4/ipvs/Makefile
@@ -0,0 +1,34 @@
+#
+# Makefile for the IPVS modules on top of IPv4.
+#
+
+# IPVS transport protocol load balancing support
+ip_vs_proto-objs-y :=
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o
+ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o
+
+ip_vs-objs :=	ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o	   \
+		ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o	   		   \
+		ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o		   \
+		$(ip_vs_proto-objs-y)
+
+
+# IPVS core
+obj-$(CONFIG_IP_VS) += ip_vs.o
+
+# IPVS schedulers
+obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o
+obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o
+obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o
+obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o
+obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
+obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
+obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
+obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
+obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
+
+# IPVS application helpers
+obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
new file mode 100644
index 000000000000..d9212addd193
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -0,0 +1,658 @@
+/*
+ * ip_vs_app.c: Application module support for IPVS
+ *
+ * Version:     $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference
+ * is that ip_vs_app module handles the reverse direction (incoming requests
+ * and outgoing responses).
+ *
+ *		IP_MASQ_APP application masquerading module
+ *
+ * Author:	Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <net/ip_vs.h>
+
+EXPORT_SYMBOL(register_ip_vs_app);
+EXPORT_SYMBOL(unregister_ip_vs_app);
+EXPORT_SYMBOL(register_ip_vs_app_inc);
+
+/* ipvs application list head */
+static LIST_HEAD(ip_vs_app_list);
+static DECLARE_MUTEX(__ip_vs_app_mutex);
+
+
+/*
+ *	Get an ip_vs_app object
+ */
+static inline int ip_vs_app_get(struct ip_vs_app *app)
+{
+	/* test and get the module atomically */
+	if (app->module)
+		return try_module_get(app->module);
+	else
+		return 1;
+}
+
+
+static inline void ip_vs_app_put(struct ip_vs_app *app)
+{
+	if (app->module)
+		module_put(app->module);
+}
+
+
+/*
+ *	Allocate/initialize app incarnation and register it in proto apps.
+ */
+static int
+ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+	struct ip_vs_protocol *pp;
+	struct ip_vs_app *inc;
+	int ret;
+
+	if (!(pp = ip_vs_proto_get(proto)))
+		return -EPROTONOSUPPORT;
+
+	if (!pp->unregister_app)
+		return -EOPNOTSUPP;
+
+	inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL);
+	if (!inc)
+		return -ENOMEM;
+	memcpy(inc, app, sizeof(*inc));
+	INIT_LIST_HEAD(&inc->p_list);
+	INIT_LIST_HEAD(&inc->incs_list);
+	inc->app = app;
+	inc->port = htons(port);
+	atomic_set(&inc->usecnt, 0);
+
+	if (app->timeouts) {
+		inc->timeout_table =
+			ip_vs_create_timeout_table(app->timeouts,
+						   app->timeouts_size);
+		if (!inc->timeout_table) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+
+	ret = pp->register_app(inc);
+	if (ret)
+		goto out;
+
+	list_add(&inc->a_list, &app->incs_list);
+	IP_VS_DBG(9, "%s application %s:%u registered\n",
+		  pp->name, inc->name, inc->port);
+
+	return 0;
+
+  out:
+	if (inc->timeout_table)
+		kfree(inc->timeout_table);
+	kfree(inc);
+	return ret;
+}
+
+
+/*
+ *	Release app incarnation
+ */
+static void
+ip_vs_app_inc_release(struct ip_vs_app *inc)
+{
+	struct ip_vs_protocol *pp;
+
+	if (!(pp = ip_vs_proto_get(inc->protocol)))
+		return;
+
+	if (pp->unregister_app)
+		pp->unregister_app(inc);
+
+	IP_VS_DBG(9, "%s App %s:%u unregistered\n",
+		  pp->name, inc->name, inc->port);
+
+	list_del(&inc->a_list);
+
+	if (inc->timeout_table != NULL)
+		kfree(inc->timeout_table);
+	kfree(inc);
+}
+
+
+/*
+ *	Get reference to app inc (only called from softirq)
+ *
+ */
+int ip_vs_app_inc_get(struct ip_vs_app *inc)
+{
+	int result;
+
+	atomic_inc(&inc->usecnt);
+	if (unlikely((result = ip_vs_app_get(inc->app)) != 1))
+		atomic_dec(&inc->usecnt);
+	return result;
+}
+
+
+/*
+ *	Put the app inc (only called from timer or net softirq)
+ */
+void ip_vs_app_inc_put(struct ip_vs_app *inc)
+{
+	ip_vs_app_put(inc->app);
+	atomic_dec(&inc->usecnt);
+}
+
+
+/*
+ *	Register an application incarnation in protocol applications
+ */
+int
+register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
+{
+	int result;
+
+	down(&__ip_vs_app_mutex);
+
+	result = ip_vs_app_inc_new(app, proto, port);
+
+	up(&__ip_vs_app_mutex);
+
+	return result;
+}
+
+
+/*
+ *	ip_vs_app registration routine
+ */
+int register_ip_vs_app(struct ip_vs_app *app)
+{
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	down(&__ip_vs_app_mutex);
+
+	list_add(&app->a_list, &ip_vs_app_list);
+
+	up(&__ip_vs_app_mutex);
+
+	return 0;
+}
+
+
+/*
+ *	ip_vs_app unregistration routine
+ *	We are sure there are no app incarnations attached to services
+ */
+void unregister_ip_vs_app(struct ip_vs_app *app)
+{
+	struct ip_vs_app *inc, *nxt;
+
+	down(&__ip_vs_app_mutex);
+
+	list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
+		ip_vs_app_inc_release(inc);
+	}
+
+	list_del(&app->a_list);
+
+	up(&__ip_vs_app_mutex);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+}
+
+
+#if 0000
+/*
+ *	Get reference to app by name (called from user context)
+ */
+struct ip_vs_app *ip_vs_app_get_by_name(char *appname)
+{
+	struct ip_vs_app *app, *a = NULL;
+
+	down(&__ip_vs_app_mutex);
+
+	list_for_each_entry(ent, &ip_vs_app_list, a_list) {
+		if (strcmp(app->name, appname))
+			continue;
+
+		/* softirq may call ip_vs_app_get too, so the caller
+		   must disable softirq on the current CPU */
+		if (ip_vs_app_get(app))
+			a = app;
+		break;
+	}
+
+	up(&__ip_vs_app_mutex);
+
+	return a;
+}
+#endif
+
+
+/*
+ *	Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
+ */
+int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp)
+{
+	return pp->app_conn_bind(cp);
+}
+
+
+/*
+ *	Unbind cp from application incarnation (called by cp destructor)
+ */
+void ip_vs_unbind_app(struct ip_vs_conn *cp)
+{
+	struct ip_vs_app *inc = cp->app;
+
+	if (!inc)
+		return;
+
+	if (inc->unbind_conn)
+		inc->unbind_conn(inc, cp);
+	if (inc->done_conn)
+		inc->done_conn(inc, cp);
+	ip_vs_app_inc_put(inc);
+	cp->app = NULL;
+}
+
+
+/*
+ *	Fixes th->seq based on ip_vs_seq info.
+ */
+static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+	__u32 seq = ntohl(th->seq);
+
+	/*
+	 *	Adjust seq with delta-offset for all packets after
+	 *	the most recent resized pkt seq and with previous_delta offset
+	 *	for all packets	before most recent resized pkt seq.
+	 */
+	if (vseq->delta || vseq->previous_delta) {
+		if(after(seq, vseq->init_seq)) {
+			th->seq = htonl(seq + vseq->delta);
+			IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n",
+				  vseq->delta);
+		} else {
+			th->seq = htonl(seq + vseq->previous_delta);
+			IP_VS_DBG(9, "vs_fix_seq(): added previous_delta "
+				  "(%d) to seq\n", vseq->previous_delta);
+		}
+	}
+}
+
+
+/*
+ *	Fixes th->ack_seq based on ip_vs_seq info.
+ */
+static inline void
+vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th)
+{
+	__u32 ack_seq = ntohl(th->ack_seq);
+
+	/*
+	 * Adjust ack_seq with delta-offset for
+	 * the packets AFTER most recent resized pkt has caused a shift
+	 * for packets before most recent resized pkt, use previous_delta
+	 */
+	if (vseq->delta || vseq->previous_delta) {
+		/* since ack_seq is the number of octet that is expected
+		   to receive next, so compare it with init_seq+delta */
+		if(after(ack_seq, vseq->init_seq+vseq->delta)) {
+			th->ack_seq = htonl(ack_seq - vseq->delta);
+			IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta "
+				  "(%d) from ack_seq\n", vseq->delta);
+
+		} else {
+			th->ack_seq = htonl(ack_seq - vseq->previous_delta);
+			IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted "
+				  "previous_delta (%d) from ack_seq\n",
+				  vseq->previous_delta);
+		}
+	}
+}
+
+
+/*
+ *	Updates ip_vs_seq if pkt has been resized
+ *	Assumes already checked proto==IPPROTO_TCP and diff!=0.
+ */
+static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
+				 unsigned flag, __u32 seq, int diff)
+{
+	/* spinlock is to keep updating cp->flags atomic */
+	spin_lock(&cp->lock);
+	if (!(cp->flags & flag) || after(seq, vseq->init_seq)) {
+		vseq->previous_delta = vseq->delta;
+		vseq->delta += diff;
+		vseq->init_seq = seq;
+		cp->flags |= flag;
+	}
+	spin_unlock(&cp->lock);
+}
+
+static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
+				  struct ip_vs_app *app)
+{
+	int diff;
+	unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
+	struct tcphdr *th;
+	__u32 seq;
+
+	if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
+		return 0;
+
+	th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
+
+	/*
+	 *	Remember seq number in case this pkt gets resized
+	 */
+	seq = ntohl(th->seq);
+
+	/*
+	 *	Fix seq stuff if flagged as so.
+	 */
+	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+		vs_fix_seq(&cp->out_seq, th);
+	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+		vs_fix_ack_seq(&cp->in_seq, th);
+
+	/*
+	 *	Call private output hook function
+	 */
+	if (app->pkt_out == NULL)
+		return 1;
+
+	if (!app->pkt_out(app, cp, pskb, &diff))
+		return 0;
+
+	/*
+	 *	Update ip_vs seq stuff if len has changed.
+	 */
+	if (diff != 0)
+		vs_seq_update(cp, &cp->out_seq,
+			      IP_VS_CONN_F_OUT_SEQ, seq, diff);
+
+	return 1;
+}
+
+/*
+ *	Output pkt hook. Will call bound ip_vs_app specific function
+ *	called by ipvs packet handler, assumes previously checked cp!=NULL
+ *	returns false if it can't handle packet (oom)
+ */
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb)
+{
+	struct ip_vs_app *app;
+
+	/*
+	 *	check if application module is bound to
+	 *	this ip_vs_conn.
+	 */
+	if ((app = cp->app) == NULL)
+		return 1;
+
+	/* TCP is complicated */
+	if (cp->protocol == IPPROTO_TCP)
+		return app_tcp_pkt_out(cp, pskb, app);
+
+	/*
+	 *	Call private output hook function
+	 */
+	if (app->pkt_out == NULL)
+		return 1;
+
+	return app->pkt_out(app, cp, pskb, NULL);
+}
+
+
+static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
+				 struct ip_vs_app *app)
+{
+	int diff;
+	unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
+	struct tcphdr *th;
+	__u32 seq;
+
+	if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
+		return 0;
+
+	th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
+
+	/*
+	 *	Remember seq number in case this pkt gets resized
+	 */
+	seq = ntohl(th->seq);
+
+	/*
+	 *	Fix seq stuff if flagged as so.
+	 */
+	if (cp->flags & IP_VS_CONN_F_IN_SEQ)
+		vs_fix_seq(&cp->in_seq, th);
+	if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
+		vs_fix_ack_seq(&cp->out_seq, th);
+
+	/*
+	 *	Call private input hook function
+	 */
+	if (app->pkt_in == NULL)
+		return 1;
+
+	if (!app->pkt_in(app, cp, pskb, &diff))
+		return 0;
+
+	/*
+	 *	Update ip_vs seq stuff if len has changed.
+	 */
+	if (diff != 0)
+		vs_seq_update(cp, &cp->in_seq,
+			      IP_VS_CONN_F_IN_SEQ, seq, diff);
+
+	return 1;
+}
+
+/*
+ *	Input pkt hook. Will call bound ip_vs_app specific function
+ *	called by ipvs packet handler, assumes previously checked cp!=NULL.
+ *	returns false if can't handle packet (oom).
+ */
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb)
+{
+	struct ip_vs_app *app;
+
+	/*
+	 *	check if application module is bound to
+	 *	this ip_vs_conn.
+	 */
+	if ((app = cp->app) == NULL)
+		return 1;
+
+	/* TCP is complicated */
+	if (cp->protocol == IPPROTO_TCP)
+		return app_tcp_pkt_in(cp, pskb, app);
+
+	/*
+	 *	Call private input hook function
+	 */
+	if (app->pkt_in == NULL)
+		return 1;
+
+	return app->pkt_in(app, cp, pskb, NULL);
+}
+
+
+#ifdef CONFIG_PROC_FS
+/*
+ *	/proc/net/ip_vs_app entry function
+ */
+
+static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
+{
+	struct ip_vs_app *app, *inc;
+
+	list_for_each_entry(app, &ip_vs_app_list, a_list) {
+		list_for_each_entry(inc, &app->incs_list, a_list) {
+			if (pos-- == 0)
+				return inc;
+		}
+	}
+	return NULL;
+
+}
+
+static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	down(&__ip_vs_app_mutex);
+
+	return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_vs_app *inc, *app;
+	struct list_head *e;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_app_idx(0);
+
+	inc = v;
+	app = inc->app;
+
+	if ((e = inc->a_list.next) != &app->incs_list)
+		return list_entry(e, struct ip_vs_app, a_list);
+
+	/* go on to next application */
+	for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) {
+		app = list_entry(e, struct ip_vs_app, a_list);
+		list_for_each_entry(inc, &app->incs_list, a_list) {
+			return inc;
+		}
+	}
+	return NULL;
+}
+
+static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
+{
+	up(&__ip_vs_app_mutex);
+}
+
+static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq, "prot port    usecnt name\n");
+	else {
+		const struct ip_vs_app *inc = v;
+
+		seq_printf(seq, "%-3s  %-7u %-6d %-17s\n",
+			   ip_vs_proto_name(inc->protocol),
+			   ntohs(inc->port),
+			   atomic_read(&inc->usecnt),
+			   inc->name);
+	}
+	return 0;
+}
+
+static struct seq_operations ip_vs_app_seq_ops = {
+	.start = ip_vs_app_seq_start,
+	.next  = ip_vs_app_seq_next,
+	.stop  = ip_vs_app_seq_stop,
+	.show  = ip_vs_app_seq_show,
+};
+
+static int ip_vs_app_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ip_vs_app_seq_ops);
+}
+
+static struct file_operations ip_vs_app_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = ip_vs_app_open,
+	.read	 = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+#endif
+
+
+/*
+ *	Replace a segment of data with a new segment
+ */
+int ip_vs_skb_replace(struct sk_buff *skb, int pri,
+		      char *o_buf, int o_len, char *n_buf, int n_len)
+{
+	struct iphdr *iph;
+	int diff;
+	int o_offset;
+	int o_left;
+
+	EnterFunction(9);
+
+	diff = n_len - o_len;
+	o_offset = o_buf - (char *)skb->data;
+	/* The length of left data after o_buf+o_len in the skb data */
+	o_left = skb->len - (o_offset + o_len);
+
+	if (diff <= 0) {
+		memmove(o_buf + n_len, o_buf + o_len, o_left);
+		memcpy(o_buf, n_buf, n_len);
+		skb_trim(skb, skb->len + diff);
+	} else if (diff <= skb_tailroom(skb)) {
+		skb_put(skb, diff);
+		memmove(o_buf + n_len, o_buf + o_len, o_left);
+		memcpy(o_buf, n_buf, n_len);
+	} else {
+		if (pskb_expand_head(skb, skb_headroom(skb), diff, pri))
+			return -ENOMEM;
+		skb_put(skb, diff);
+		memmove(skb->data + o_offset + n_len,
+			skb->data + o_offset + o_len, o_left);
+		memcpy(skb->data + o_offset, n_buf, n_len);
+	}
+
+	/* must update the iph total length here */
+	iph = skb->nh.iph;
+	iph->tot_len = htons(skb->len);
+
+	LeaveFunction(9);
+	return 0;
+}
+
+
+int ip_vs_app_init(void)
+{
+	/* we will replace it with proc_net_ipvs_create() soon */
+	proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops);
+	return 0;
+}
+
+
+void ip_vs_app_cleanup(void)
+{
+	proc_net_remove("ip_vs_app");
+}
diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c
new file mode 100644
index 000000000000..fd6feb5499fe
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_conn.c
@@ -0,0 +1,920 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Version:     $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others. Many code here is taken from IP MASQ code of kernel 2.2.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>		/* for proc_net_* */
+#include <linux/seq_file.h>
+#include <linux/jhash.h>
+#include <linux/random.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *  Connection hash table: for input and output packets lookups of IPVS
+ */
+static struct list_head *ip_vs_conn_tab;
+
+/*  SLAB cache for IPVS connections */
+static kmem_cache_t *ip_vs_conn_cachep;
+
+/*  counter for current IPVS connections */
+static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
+
+/*  counter for no client port connections */
+static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
+
+/* random value for IPVS connection hash */
+static unsigned int ip_vs_conn_rnd;
+
+/*
+ *  Fine locking granularity for big connection hash table
+ */
+#define CT_LOCKARRAY_BITS  4
+#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
+#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
+
+struct ip_vs_aligned_lock
+{
+	rwlock_t	l;
+} __attribute__((__aligned__(SMP_CACHE_BYTES)));
+
+/* lock array for conn table */
+static struct ip_vs_aligned_lock
+__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
+
+static inline void ct_read_lock(unsigned key)
+{
+	read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock(unsigned key)
+{
+	read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock(unsigned key)
+{
+	write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock(unsigned key)
+{
+	write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_lock_bh(unsigned key)
+{
+	read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_read_unlock_bh(unsigned key)
+{
+	read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_lock_bh(unsigned key)
+{
+	write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+static inline void ct_write_unlock_bh(unsigned key)
+{
+	write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
+}
+
+
+/*
+ *	Returns hash value for IPVS connection entry
+ */
+static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
+{
+	return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
+		& IP_VS_CONN_TAB_MASK;
+}
+
+
+/*
+ *	Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
+ *	returns bool success.
+ */
+static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
+{
+	unsigned hash;
+	int ret;
+
+	/* Hash by protocol, client address and port */
+	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+
+	ct_write_lock(hash);
+
+	if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
+		list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
+		cp->flags |= IP_VS_CONN_F_HASHED;
+		atomic_inc(&cp->refcnt);
+		ret = 1;
+	} else {
+		IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
+			  "called from %p\n", __builtin_return_address(0));
+		ret = 0;
+	}
+
+	ct_write_unlock(hash);
+
+	return ret;
+}
+
+
+/*
+ *	UNhashes ip_vs_conn from ip_vs_conn_tab.
+ *	returns bool success.
+ */
+static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
+{
+	unsigned hash;
+	int ret;
+
+	/* unhash it and decrease its reference counter */
+	hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
+
+	ct_write_lock(hash);
+
+	if (cp->flags & IP_VS_CONN_F_HASHED) {
+		list_del(&cp->c_list);
+		cp->flags &= ~IP_VS_CONN_F_HASHED;
+		atomic_dec(&cp->refcnt);
+		ret = 1;
+	} else
+		ret = 0;
+
+	ct_write_unlock(hash);
+
+	return ret;
+}
+
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from OUTside-to-INside.
+ *	s_addr, s_port: pkt source address (foreign host)
+ *	d_addr, d_port: pkt dest address (load balancer)
+ */
+static inline struct ip_vs_conn *__ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp;
+
+	hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
+
+	ct_read_lock(hash);
+
+	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		if (s_addr==cp->caddr && s_port==cp->cport &&
+		    d_port==cp->vport && d_addr==cp->vaddr &&
+		    protocol==cp->protocol) {
+			/* HIT */
+			atomic_inc(&cp->refcnt);
+			ct_read_unlock(hash);
+			return cp;
+		}
+	}
+
+	ct_read_unlock(hash);
+
+	return NULL;
+}
+
+struct ip_vs_conn *ip_vs_conn_in_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+	struct ip_vs_conn *cp;
+
+	cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
+	if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
+		cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
+
+	IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+		  ip_vs_proto_name(protocol),
+		  NIPQUAD(s_addr), ntohs(s_port),
+		  NIPQUAD(d_addr), ntohs(d_port),
+		  cp?"hit":"not hit");
+
+	return cp;
+}
+
+
+/*
+ *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
+ *  Called for pkts coming from inside-to-OUTside.
+ *	s_addr, s_port: pkt source address (inside host)
+ *	d_addr, d_port: pkt dest address (foreign host)
+ */
+struct ip_vs_conn *ip_vs_conn_out_get
+(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
+{
+	unsigned hash;
+	struct ip_vs_conn *cp, *ret=NULL;
+
+	/*
+	 *	Check for "full" addressed entries
+	 */
+	hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
+
+	ct_read_lock(hash);
+
+	list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+		if (d_addr == cp->caddr && d_port == cp->cport &&
+		    s_port == cp->dport && s_addr == cp->daddr &&
+		    protocol == cp->protocol) {
+			/* HIT */
+			atomic_inc(&cp->refcnt);
+			ret = cp;
+			break;
+		}
+	}
+
+	ct_read_unlock(hash);
+
+	IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
+		  ip_vs_proto_name(protocol),
+		  NIPQUAD(s_addr), ntohs(s_port),
+		  NIPQUAD(d_addr), ntohs(d_port),
+		  ret?"hit":"not hit");
+
+	return ret;
+}
+
+
+/*
+ *      Put back the conn and restart its timer with its timeout
+ */
+void ip_vs_conn_put(struct ip_vs_conn *cp)
+{
+	/* reset it expire in its timeout */
+	mod_timer(&cp->timer, jiffies+cp->timeout);
+
+	__ip_vs_conn_put(cp);
+}
+
+
+/*
+ *	Fill a no_client_port connection with a client port number
+ */
+void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
+{
+	if (ip_vs_conn_unhash(cp)) {
+		spin_lock(&cp->lock);
+		if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
+			atomic_dec(&ip_vs_conn_no_cport_cnt);
+			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
+			cp->cport = cport;
+		}
+		spin_unlock(&cp->lock);
+
+		/* hash on new dport */
+		ip_vs_conn_hash(cp);
+	}
+}
+
+
+/*
+ *	Bind a connection entry with the corresponding packet_xmit.
+ *	Called by ip_vs_conn_new.
+ */
+static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
+{
+	switch (IP_VS_FWD_METHOD(cp)) {
+	case IP_VS_CONN_F_MASQ:
+		cp->packet_xmit = ip_vs_nat_xmit;
+		break;
+
+	case IP_VS_CONN_F_TUNNEL:
+		cp->packet_xmit = ip_vs_tunnel_xmit;
+		break;
+
+	case IP_VS_CONN_F_DROUTE:
+		cp->packet_xmit = ip_vs_dr_xmit;
+		break;
+
+	case IP_VS_CONN_F_LOCALNODE:
+		cp->packet_xmit = ip_vs_null_xmit;
+		break;
+
+	case IP_VS_CONN_F_BYPASS:
+		cp->packet_xmit = ip_vs_bypass_xmit;
+		break;
+	}
+}
+
+
+static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
+{
+	return atomic_read(&dest->activeconns)
+		+ atomic_read(&dest->inactconns);
+}
+
+/*
+ *	Bind a connection entry with a virtual service destination
+ *	Called just after a new connection entry is created.
+ */
+static inline void
+ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
+{
+	/* if dest is NULL, then return directly */
+	if (!dest)
+		return;
+
+	/* Increase the refcnt counter of the dest */
+	atomic_inc(&dest->refcnt);
+
+	/* Bind with the destination and its corresponding transmitter */
+	cp->flags |= atomic_read(&dest->conn_flags);
+	cp->dest = dest;
+
+	IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+		  "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+		  ip_vs_proto_name(cp->protocol),
+		  NIPQUAD(cp->caddr), ntohs(cp->cport),
+		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+		  NIPQUAD(cp->daddr), ntohs(cp->dport),
+		  ip_vs_fwd_tag(cp), cp->state,
+		  cp->flags, atomic_read(&cp->refcnt),
+		  atomic_read(&dest->refcnt));
+
+	/* Update the connection counters */
+	if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		/* It is a normal connection, so increase the inactive
+		   connection counter because it is in TCP SYNRECV
+		   state (inactive) or other protocol inacive state */
+		atomic_inc(&dest->inactconns);
+	} else {
+		/* It is a persistent connection/template, so increase
+		   the peristent connection counter */
+		atomic_inc(&dest->persistconns);
+	}
+
+	if (dest->u_threshold != 0 &&
+	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
+		dest->flags |= IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *	Unbind a connection entry with its VS destination
+ *	Called by the ip_vs_conn_expire function.
+ */
+static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
+{
+	struct ip_vs_dest *dest = cp->dest;
+
+	if (!dest)
+		return;
+
+	IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+		  "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n",
+		  ip_vs_proto_name(cp->protocol),
+		  NIPQUAD(cp->caddr), ntohs(cp->cport),
+		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+		  NIPQUAD(cp->daddr), ntohs(cp->dport),
+		  ip_vs_fwd_tag(cp), cp->state,
+		  cp->flags, atomic_read(&cp->refcnt),
+		  atomic_read(&dest->refcnt));
+
+	/* Update the connection counters */
+	if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		/* It is a normal connection, so decrease the inactconns
+		   or activeconns counter */
+		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
+			atomic_dec(&dest->inactconns);
+		} else {
+			atomic_dec(&dest->activeconns);
+		}
+	} else {
+		/* It is a persistent connection/template, so decrease
+		   the peristent connection counter */
+		atomic_dec(&dest->persistconns);
+	}
+
+	if (dest->l_threshold != 0) {
+		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	} else if (dest->u_threshold != 0) {
+		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	} else {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	}
+
+	/*
+	 * Simply decrease the refcnt of the dest, because the
+	 * dest will be either in service's destination list
+	 * or in the trash.
+	 */
+	atomic_dec(&dest->refcnt);
+}
+
+
+/*
+ *	Checking if the destination of a connection template is available.
+ *	If available, return 1, otherwise invalidate this connection
+ *	template and return 0.
+ */
+int ip_vs_check_template(struct ip_vs_conn *ct)
+{
+	struct ip_vs_dest *dest = ct->dest;
+
+	/*
+	 * Checking the dest server status.
+	 */
+	if ((dest == NULL) ||
+	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 
+	    (sysctl_ip_vs_expire_quiescent_template && 
+	     (atomic_read(&dest->weight) == 0))) {
+		IP_VS_DBG(9, "check_template: dest not available for "
+			  "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
+			  "-> d:%u.%u.%u.%u:%d\n",
+			  ip_vs_proto_name(ct->protocol),
+			  NIPQUAD(ct->caddr), ntohs(ct->cport),
+			  NIPQUAD(ct->vaddr), ntohs(ct->vport),
+			  NIPQUAD(ct->daddr), ntohs(ct->dport));
+
+		/*
+		 * Invalidate the connection template
+		 */
+		if (ct->cport) {
+			if (ip_vs_conn_unhash(ct)) {
+				ct->dport = 65535;
+				ct->vport = 65535;
+				ct->cport = 0;
+				ip_vs_conn_hash(ct);
+			}
+		}
+
+		/*
+		 * Simply decrease the refcnt of the template,
+		 * don't restart its timer.
+		 */
+		atomic_dec(&ct->refcnt);
+		return 0;
+	}
+	return 1;
+}
+
+static void ip_vs_conn_expire(unsigned long data)
+{
+	struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
+
+	cp->timeout = 60*HZ;
+
+	/*
+	 *	hey, I'm using it
+	 */
+	atomic_inc(&cp->refcnt);
+
+	/*
+	 *	do I control anybody?
+	 */
+	if (atomic_read(&cp->n_control))
+		goto expire_later;
+
+	/*
+	 *	unhash it if it is hashed in the conn table
+	 */
+	if (!ip_vs_conn_unhash(cp))
+		goto expire_later;
+
+	/*
+	 *	refcnt==1 implies I'm the only one referrer
+	 */
+	if (likely(atomic_read(&cp->refcnt) == 1)) {
+		/* delete the timer if it is activated by other users */
+		if (timer_pending(&cp->timer))
+			del_timer(&cp->timer);
+
+		/* does anybody control me? */
+		if (cp->control)
+			ip_vs_control_del(cp);
+
+		if (unlikely(cp->app != NULL))
+			ip_vs_unbind_app(cp);
+		ip_vs_unbind_dest(cp);
+		if (cp->flags & IP_VS_CONN_F_NO_CPORT)
+			atomic_dec(&ip_vs_conn_no_cport_cnt);
+		atomic_dec(&ip_vs_conn_count);
+
+		kmem_cache_free(ip_vs_conn_cachep, cp);
+		return;
+	}
+
+	/* hash it back to the table */
+	ip_vs_conn_hash(cp);
+
+  expire_later:
+	IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n",
+		  atomic_read(&cp->refcnt)-1,
+		  atomic_read(&cp->n_control));
+
+	ip_vs_conn_put(cp);
+}
+
+
+void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
+{
+	if (del_timer(&cp->timer))
+		mod_timer(&cp->timer, jiffies);
+	__ip_vs_conn_put(cp);
+}
+
+
+/*
+ *	Create a new connection entry and hash it into the ip_vs_conn_tab
+ */
+struct ip_vs_conn *
+ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
+	       __u32 daddr, __u16 dport, unsigned flags,
+	       struct ip_vs_dest *dest)
+{
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
+	if (cp == NULL) {
+		IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
+		return NULL;
+	}
+
+	memset(cp, 0, sizeof(*cp));
+	INIT_LIST_HEAD(&cp->c_list);
+	init_timer(&cp->timer);
+	cp->timer.data     = (unsigned long)cp;
+	cp->timer.function = ip_vs_conn_expire;
+	cp->protocol	   = proto;
+	cp->caddr	   = caddr;
+	cp->cport	   = cport;
+	cp->vaddr	   = vaddr;
+	cp->vport	   = vport;
+	cp->daddr          = daddr;
+	cp->dport          = dport;
+	cp->flags	   = flags;
+	spin_lock_init(&cp->lock);
+
+	/*
+	 * Set the entry is referenced by the current thread before hashing
+	 * it in the table, so that other thread run ip_vs_random_dropentry
+	 * but cannot drop this entry.
+	 */
+	atomic_set(&cp->refcnt, 1);
+
+	atomic_set(&cp->n_control, 0);
+	atomic_set(&cp->in_pkts, 0);
+
+	atomic_inc(&ip_vs_conn_count);
+	if (flags & IP_VS_CONN_F_NO_CPORT)
+		atomic_inc(&ip_vs_conn_no_cport_cnt);
+
+	/* Bind the connection with a destination server */
+	ip_vs_bind_dest(cp, dest);
+
+	/* Set its state and timeout */
+	cp->state = 0;
+	cp->timeout = 3*HZ;
+
+	/* Bind its packet transmitter */
+	ip_vs_bind_xmit(cp);
+
+	if (unlikely(pp && atomic_read(&pp->appcnt)))
+		ip_vs_bind_app(cp, pp);
+
+	/* Hash it in the ip_vs_conn_tab finally */
+	ip_vs_conn_hash(cp);
+
+	return cp;
+}
+
+
+/*
+ *	/proc/net/ip_vs_conn entries
+ */
+#ifdef CONFIG_PROC_FS
+
+static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+	
+	for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+		ct_read_lock_bh(idx);
+		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+			if (pos-- == 0) {
+				seq->private = &ip_vs_conn_tab[idx];
+				return cp;
+			}
+		}
+		ct_read_unlock_bh(idx);
+	}
+
+	return NULL;
+}
+
+static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	seq->private = NULL;
+	return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
+}
+
+static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct ip_vs_conn *cp = v;
+	struct list_head *e, *l = seq->private;
+	int idx;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN) 
+		return ip_vs_conn_array(seq, 0);
+
+	/* more on same hash chain? */
+	if ((e = cp->c_list.next) != l)
+		return list_entry(e, struct ip_vs_conn, c_list);
+
+	idx = l - ip_vs_conn_tab;
+	ct_read_unlock_bh(idx);
+
+	while (++idx < IP_VS_CONN_TAB_SIZE) {
+		ct_read_lock_bh(idx);
+		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+			seq->private = &ip_vs_conn_tab[idx];
+			return cp;
+		}	
+		ct_read_unlock_bh(idx);
+	}
+	seq->private = NULL;
+	return NULL;
+}
+
+static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
+{
+	struct list_head *l = seq->private;
+
+	if (l)
+		ct_read_unlock_bh(l - ip_vs_conn_tab);
+}
+
+static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
+{
+
+	if (v == SEQ_START_TOKEN)
+		seq_puts(seq,
+   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
+	else {
+		const struct ip_vs_conn *cp = v;
+
+		seq_printf(seq,
+			"%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
+				ip_vs_proto_name(cp->protocol),
+				ntohl(cp->caddr), ntohs(cp->cport),
+				ntohl(cp->vaddr), ntohs(cp->vport),
+				ntohl(cp->daddr), ntohs(cp->dport),
+				ip_vs_state_name(cp->protocol, cp->state),
+				(cp->timer.expires-jiffies)/HZ);
+	}
+	return 0;
+}
+
+static struct seq_operations ip_vs_conn_seq_ops = {
+	.start = ip_vs_conn_seq_start,
+	.next  = ip_vs_conn_seq_next,
+	.stop  = ip_vs_conn_seq_stop,
+	.show  = ip_vs_conn_seq_show,
+};
+
+static int ip_vs_conn_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ip_vs_conn_seq_ops);
+}
+
+static struct file_operations ip_vs_conn_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip_vs_conn_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+#endif
+
+
+/*
+ *      Randomly drop connection entries before running out of memory
+ */
+static inline int todrop_entry(struct ip_vs_conn *cp)
+{
+	/*
+	 * The drop rate array needs tuning for real environments.
+	 * Called from timer bh only => no locking
+	 */
+	static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
+	static char todrop_counter[9] = {0};
+	int i;
+
+	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
+	   This will leave enough time for normal connection to get
+	   through. */
+	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
+		return 0;
+
+	/* Don't drop the entry if its number of incoming packets is not
+	   located in [0, 8] */
+	i = atomic_read(&cp->in_pkts);
+	if (i > 8 || i < 0) return 0;
+
+	if (!todrop_rate[i]) return 0;
+	if (--todrop_counter[i] > 0) return 0;
+
+	todrop_counter[i] = todrop_rate[i];
+	return 1;
+}
+
+
+void ip_vs_random_dropentry(void)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+	struct ip_vs_conn *ct;
+
+	/*
+	 * Randomly scan 1/32 of the whole table every second
+	 */
+	for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
+		unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
+
+		/*
+		 *  Lock is actually needed in this loop.
+		 */
+		ct_write_lock(hash);
+
+		list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
+			if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT))
+				/* connection template */
+				continue;
+
+			if (cp->protocol == IPPROTO_TCP) {
+				switch(cp->state) {
+				case IP_VS_TCP_S_SYN_RECV:
+				case IP_VS_TCP_S_SYNACK:
+					break;
+
+				case IP_VS_TCP_S_ESTABLISHED:
+					if (todrop_entry(cp))
+						break;
+					continue;
+
+				default:
+					continue;
+				}
+			} else {
+				if (!todrop_entry(cp))
+					continue;
+			}
+
+			/*
+			 * Drop the entry, and drop its ct if not referenced
+			 */
+			atomic_inc(&cp->refcnt);
+			ct_write_unlock(hash);
+
+			if ((ct = cp->control))
+				atomic_inc(&ct->refcnt);
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
+			if (ct) {
+				IP_VS_DBG(4, "del conn template\n");
+				ip_vs_conn_expire_now(ct);
+			}
+			ct_write_lock(hash);
+		}
+		ct_write_unlock(hash);
+	}
+}
+
+
+/*
+ *      Flush all the connection entries in the ip_vs_conn_tab
+ */
+static void ip_vs_conn_flush(void)
+{
+	int idx;
+	struct ip_vs_conn *cp;
+	struct ip_vs_conn *ct;
+
+  flush_again:
+	for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
+		/*
+		 *  Lock is actually needed in this loop.
+		 */
+		ct_write_lock_bh(idx);
+
+		list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
+			atomic_inc(&cp->refcnt);
+			ct_write_unlock(idx);
+
+			if ((ct = cp->control))
+				atomic_inc(&ct->refcnt);
+			IP_VS_DBG(4, "del connection\n");
+			ip_vs_conn_expire_now(cp);
+			if (ct) {
+				IP_VS_DBG(4, "del conn template\n");
+				ip_vs_conn_expire_now(ct);
+			}
+			ct_write_lock(idx);
+		}
+		ct_write_unlock_bh(idx);
+	}
+
+	/* the counter may be not NULL, because maybe some conn entries
+	   are run by slow timer handler or unhashed but still referred */
+	if (atomic_read(&ip_vs_conn_count) != 0) {
+		schedule();
+		goto flush_again;
+	}
+}
+
+
+int ip_vs_conn_init(void)
+{
+	int idx;
+
+	/*
+	 * Allocate the connection hash table and initialize its list heads
+	 */
+	ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
+	if (!ip_vs_conn_tab)
+		return -ENOMEM;
+
+	/* Allocate ip_vs_conn slab cache */
+	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
+					      sizeof(struct ip_vs_conn), 0,
+					      SLAB_HWCACHE_ALIGN, NULL, NULL);
+	if (!ip_vs_conn_cachep) {
+		vfree(ip_vs_conn_tab);
+		return -ENOMEM;
+	}
+
+	IP_VS_INFO("Connection hash table configured "
+		   "(size=%d, memory=%ldKbytes)\n",
+		   IP_VS_CONN_TAB_SIZE,
+		   (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
+	IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
+		  sizeof(struct ip_vs_conn));
+
+	for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
+		INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
+	}
+
+	for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
+		rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
+	}
+
+	proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
+
+	/* calculate the random value for connection hash */
+	get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
+
+	return 0;
+}
+
+
+void ip_vs_conn_cleanup(void)
+{
+	/* flush all the connection entries first */
+	ip_vs_conn_flush();
+
+	/* Release the empty cache */
+	kmem_cache_destroy(ip_vs_conn_cachep);
+	proc_net_remove("ip_vs_conn");
+	vfree(ip_vs_conn_tab);
+}
diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c
new file mode 100644
index 000000000000..5fb257dd07cb
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_core.c
@@ -0,0 +1,1191 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Version:     $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
+ * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
+ * and others.
+ *
+ * Changes:
+ *	Paul `Rusty' Russell		properly handle non-linear skbs
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/icmp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+EXPORT_SYMBOL(register_ip_vs_scheduler);
+EXPORT_SYMBOL(unregister_ip_vs_scheduler);
+EXPORT_SYMBOL(ip_vs_skb_replace);
+EXPORT_SYMBOL(ip_vs_proto_name);
+EXPORT_SYMBOL(ip_vs_conn_new);
+EXPORT_SYMBOL(ip_vs_conn_in_get);
+EXPORT_SYMBOL(ip_vs_conn_out_get);
+#ifdef CONFIG_IP_VS_PROTO_TCP
+EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
+#endif
+EXPORT_SYMBOL(ip_vs_conn_put);
+#ifdef CONFIG_IP_VS_DEBUG
+EXPORT_SYMBOL(ip_vs_get_debug_level);
+#endif
+EXPORT_SYMBOL(ip_vs_make_skb_writable);
+
+
+/* ID used in ICMP lookups */
+#define icmp_id(icmph)          (((icmph)->un).echo.id)
+
+const char *ip_vs_proto_name(unsigned proto)
+{
+	static char buf[20];
+
+	switch (proto) {
+	case IPPROTO_IP:
+		return "IP";
+	case IPPROTO_UDP:
+		return "UDP";
+	case IPPROTO_TCP:
+		return "TCP";
+	case IPPROTO_ICMP:
+		return "ICMP";
+	default:
+		sprintf(buf, "IP_%d", proto);
+		return buf;
+	}
+}
+
+void ip_vs_init_hash_table(struct list_head *table, int rows)
+{
+	while (--rows >= 0)
+		INIT_LIST_HEAD(&table[rows]);
+}
+
+static inline void
+ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		spin_lock(&dest->stats.lock);
+		dest->stats.inpkts++;
+		dest->stats.inbytes += skb->len;
+		spin_unlock(&dest->stats.lock);
+
+		spin_lock(&dest->svc->stats.lock);
+		dest->svc->stats.inpkts++;
+		dest->svc->stats.inbytes += skb->len;
+		spin_unlock(&dest->svc->stats.lock);
+
+		spin_lock(&ip_vs_stats.lock);
+		ip_vs_stats.inpkts++;
+		ip_vs_stats.inbytes += skb->len;
+		spin_unlock(&ip_vs_stats.lock);
+	}
+}
+
+
+static inline void
+ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest = cp->dest;
+	if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		spin_lock(&dest->stats.lock);
+		dest->stats.outpkts++;
+		dest->stats.outbytes += skb->len;
+		spin_unlock(&dest->stats.lock);
+
+		spin_lock(&dest->svc->stats.lock);
+		dest->svc->stats.outpkts++;
+		dest->svc->stats.outbytes += skb->len;
+		spin_unlock(&dest->svc->stats.lock);
+
+		spin_lock(&ip_vs_stats.lock);
+		ip_vs_stats.outpkts++;
+		ip_vs_stats.outbytes += skb->len;
+		spin_unlock(&ip_vs_stats.lock);
+	}
+}
+
+
+static inline void
+ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
+{
+	spin_lock(&cp->dest->stats.lock);
+	cp->dest->stats.conns++;
+	spin_unlock(&cp->dest->stats.lock);
+
+	spin_lock(&svc->stats.lock);
+	svc->stats.conns++;
+	spin_unlock(&svc->stats.lock);
+
+	spin_lock(&ip_vs_stats.lock);
+	ip_vs_stats.conns++;
+	spin_unlock(&ip_vs_stats.lock);
+}
+
+
+static inline int
+ip_vs_set_state(struct ip_vs_conn *cp, int direction,
+		const struct sk_buff *skb,
+		struct ip_vs_protocol *pp)
+{
+	if (unlikely(!pp->state_transition))
+		return 0;
+	return pp->state_transition(cp, direction, skb, pp);
+}
+
+
+int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
+{
+	struct sk_buff *skb = *pskb;
+
+	/* skb is already used, better copy skb and its payload */
+	if (unlikely(skb_shared(skb) || skb->sk))
+		goto copy_skb;
+
+	/* skb data is already used, copy it */
+	if (unlikely(skb_cloned(skb)))
+		goto copy_data;
+
+	return pskb_may_pull(skb, writable_len);
+
+  copy_data:
+	if (unlikely(writable_len > skb->len))
+		return 0;
+	return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
+
+  copy_skb:
+	if (unlikely(writable_len > skb->len))
+		return 0;
+	skb = skb_copy(skb, GFP_ATOMIC);
+	if (!skb)
+		return 0;
+	BUG_ON(skb_is_nonlinear(skb));
+
+	/* Rest of kernel will get very unhappy if we pass it a
+	   suddenly-orphaned skbuff */
+	if ((*pskb)->sk)
+		skb_set_owner_w(skb, (*pskb)->sk);
+	kfree_skb(*pskb);
+	*pskb = skb;
+	return 1;
+}
+
+/*
+ *  IPVS persistent scheduling function
+ *  It creates a connection entry according to its template if exists,
+ *  or selects a server and creates a connection entry plus a template.
+ *  Locking: we are svc user (svc->refcnt), so we hold all dests too
+ *  Protocols supported: TCP, UDP
+ */
+static struct ip_vs_conn *
+ip_vs_sched_persist(struct ip_vs_service *svc,
+		    const struct sk_buff *skb,
+		    __u16 ports[2])
+{
+	struct ip_vs_conn *cp = NULL;
+	struct iphdr *iph = skb->nh.iph;
+	struct ip_vs_dest *dest;
+	struct ip_vs_conn *ct;
+	__u16  dport;	 /* destination port to forward */
+	__u32  snet;	 /* source network of the client, after masking */
+
+	/* Mask saddr with the netmask to adjust template granularity */
+	snet = iph->saddr & svc->netmask;
+
+	IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
+		  "mnet %u.%u.%u.%u\n",
+		  NIPQUAD(iph->saddr), ntohs(ports[0]),
+		  NIPQUAD(iph->daddr), ntohs(ports[1]),
+		  NIPQUAD(snet));
+
+	/*
+	 * As far as we know, FTP is a very complicated network protocol, and
+	 * it uses control connection and data connections. For active FTP,
+	 * FTP server initialize data connection to the client, its source port
+	 * is often 20. For passive FTP, FTP server tells the clients the port
+	 * that it passively listens to,  and the client issues the data
+	 * connection. In the tunneling or direct routing mode, the load
+	 * balancer is on the client-to-server half of connection, the port
+	 * number is unknown to the load balancer. So, a conn template like
+	 * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
+	 * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
+	 * is created for other persistent services.
+	 */
+	if (ports[1] == svc->port) {
+		/* Check if a template already exists */
+		if (svc->port != FTPPORT)
+			ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+					       iph->daddr, ports[1]);
+		else
+			ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+					       iph->daddr, 0);
+
+		if (!ct || !ip_vs_check_template(ct)) {
+			/*
+			 * No template found or the dest of the connection
+			 * template is not available.
+			 */
+			dest = svc->scheduler->schedule(svc, skb);
+			if (dest == NULL) {
+				IP_VS_DBG(1, "p-schedule: no dest found.\n");
+				return NULL;
+			}
+
+			/*
+			 * Create a template like <protocol,caddr,0,
+			 * vaddr,vport,daddr,dport> for non-ftp service,
+			 * and <protocol,caddr,0,vaddr,0,daddr,0>
+			 * for ftp service.
+			 */
+			if (svc->port != FTPPORT)
+				ct = ip_vs_conn_new(iph->protocol,
+						    snet, 0,
+						    iph->daddr,
+						    ports[1],
+						    dest->addr, dest->port,
+						    0,
+						    dest);
+			else
+				ct = ip_vs_conn_new(iph->protocol,
+						    snet, 0,
+						    iph->daddr, 0,
+						    dest->addr, 0,
+						    0,
+						    dest);
+			if (ct == NULL)
+				return NULL;
+
+			ct->timeout = svc->timeout;
+		} else {
+			/* set destination with the found template */
+			dest = ct->dest;
+		}
+		dport = dest->port;
+	} else {
+		/*
+		 * Note: persistent fwmark-based services and persistent
+		 * port zero service are handled here.
+		 * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
+		 * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
+		 */
+		if (svc->fwmark)
+			ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0,
+					       htonl(svc->fwmark), 0);
+		else
+			ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
+					       iph->daddr, 0);
+
+		if (!ct || !ip_vs_check_template(ct)) {
+			/*
+			 * If it is not persistent port zero, return NULL,
+			 * otherwise create a connection template.
+			 */
+			if (svc->port)
+				return NULL;
+
+			dest = svc->scheduler->schedule(svc, skb);
+			if (dest == NULL) {
+				IP_VS_DBG(1, "p-schedule: no dest found.\n");
+				return NULL;
+			}
+
+			/*
+			 * Create a template according to the service
+			 */
+			if (svc->fwmark)
+				ct = ip_vs_conn_new(IPPROTO_IP,
+						    snet, 0,
+						    htonl(svc->fwmark), 0,
+						    dest->addr, 0,
+						    0,
+						    dest);
+			else
+				ct = ip_vs_conn_new(iph->protocol,
+						    snet, 0,
+						    iph->daddr, 0,
+						    dest->addr, 0,
+						    0,
+						    dest);
+			if (ct == NULL)
+				return NULL;
+
+			ct->timeout = svc->timeout;
+		} else {
+			/* set destination with the found template */
+			dest = ct->dest;
+		}
+		dport = ports[1];
+	}
+
+	/*
+	 *    Create a new connection according to the template
+	 */
+	cp = ip_vs_conn_new(iph->protocol,
+			    iph->saddr, ports[0],
+			    iph->daddr, ports[1],
+			    dest->addr, dport,
+			    0,
+			    dest);
+	if (cp == NULL) {
+		ip_vs_conn_put(ct);
+		return NULL;
+	}
+
+	/*
+	 *    Add its control
+	 */
+	ip_vs_control_add(cp, ct);
+	ip_vs_conn_put(ct);
+
+	ip_vs_conn_stats(cp, svc);
+	return cp;
+}
+
+
+/*
+ *  IPVS main scheduling function
+ *  It selects a server according to the virtual service, and
+ *  creates a connection entry.
+ *  Protocols supported: TCP, UDP
+ */
+struct ip_vs_conn *
+ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_conn *cp = NULL;
+	struct iphdr *iph = skb->nh.iph;
+	struct ip_vs_dest *dest;
+	__u16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, iph->ihl*4,
+				  sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	/*
+	 *    Persistent service
+	 */
+	if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+		return ip_vs_sched_persist(svc, skb, pptr);
+
+	/*
+	 *    Non-persistent service
+	 */
+	if (!svc->fwmark && pptr[1] != svc->port) {
+		if (!svc->port)
+			IP_VS_ERR("Schedule: port zero only supported "
+				  "in persistent services, "
+				  "check your ipvs configuration\n");
+		return NULL;
+	}
+
+	dest = svc->scheduler->schedule(svc, skb);
+	if (dest == NULL) {
+		IP_VS_DBG(1, "Schedule: no dest found.\n");
+		return NULL;
+	}
+
+	/*
+	 *    Create a connection entry.
+	 */
+	cp = ip_vs_conn_new(iph->protocol,
+			    iph->saddr, pptr[0],
+			    iph->daddr, pptr[1],
+			    dest->addr, dest->port?dest->port:pptr[1],
+			    0,
+			    dest);
+	if (cp == NULL)
+		return NULL;
+
+	IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
+		  "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+		  ip_vs_fwd_tag(cp),
+		  NIPQUAD(cp->caddr), ntohs(cp->cport),
+		  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+		  NIPQUAD(cp->daddr), ntohs(cp->dport),
+		  cp->flags, atomic_read(&cp->refcnt));
+
+	ip_vs_conn_stats(cp, svc);
+	return cp;
+}
+
+
+/*
+ *  Pass or drop the packet.
+ *  Called by ip_vs_in, when the virtual service is available but
+ *  no destination is available for a new connection.
+ */
+int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
+		struct ip_vs_protocol *pp)
+{
+	__u16 _ports[2], *pptr;
+	struct iphdr *iph = skb->nh.iph;
+
+	pptr = skb_header_pointer(skb, iph->ihl*4,
+				  sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		ip_vs_service_put(svc);
+		return NF_DROP;
+	}
+
+	/* if it is fwmark-based service, the cache_bypass sysctl is up
+	   and the destination is RTN_UNICAST (and not local), then create
+	   a cache_bypass connection entry */
+	if (sysctl_ip_vs_cache_bypass && svc->fwmark
+	    && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
+		int ret, cs;
+		struct ip_vs_conn *cp;
+
+		ip_vs_service_put(svc);
+
+		/* create a new connection entry */
+		IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
+		cp = ip_vs_conn_new(iph->protocol,
+				    iph->saddr, pptr[0],
+				    iph->daddr, pptr[1],
+				    0, 0,
+				    IP_VS_CONN_F_BYPASS,
+				    NULL);
+		if (cp == NULL)
+			return NF_DROP;
+
+		/* statistics */
+		ip_vs_in_stats(cp, skb);
+
+		/* set state */
+		cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+
+		/* transmit the first SYN packet */
+		ret = cp->packet_xmit(skb, cp, pp);
+		/* do not touch skb anymore */
+
+		atomic_inc(&cp->in_pkts);
+		ip_vs_conn_put(cp);
+		return ret;
+	}
+
+	/*
+	 * When the virtual ftp service is presented, packets destined
+	 * for other services on the VIP may get here (except services
+	 * listed in the ipvs table), pass the packets, because it is
+	 * not ipvs job to decide to drop the packets.
+	 */
+	if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
+		ip_vs_service_put(svc);
+		return NF_ACCEPT;
+	}
+
+	ip_vs_service_put(svc);
+
+	/*
+	 * Notify the client that the destination is unreachable, and
+	 * release the socket buffer.
+	 * Since it is in IP layer, the TCP socket is not actually
+	 * created, the TCP RST packet cannot be sent, instead that
+	 * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
+	 */
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+	return NF_DROP;
+}
+
+
+/*
+ *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
+ *      chain, and is used for VS/NAT.
+ *      It detects packets for VS/NAT connections and sends the packets
+ *      immediately. This can avoid that iptable_nat mangles the packets
+ *      for VS/NAT.
+ */
+static unsigned int ip_vs_post_routing(unsigned int hooknum,
+				       struct sk_buff **pskb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
+		return NF_ACCEPT;
+
+	/* The packet was sent from IPVS, exit this chain */
+	(*okfn)(*pskb);
+
+	return NF_STOLEN;
+}
+
+u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+{
+	return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+}
+
+static inline struct sk_buff *
+ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+	skb = ip_defrag(skb, user);
+	if (skb)
+		ip_send_check(skb->nh.iph);
+	return skb;
+}
+
+/*
+ * Packet has been made sufficiently writable in caller
+ * - inout: 1=in->out, 0=out->in
+ */
+void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		    struct ip_vs_conn *cp, int inout)
+{
+	struct iphdr *iph	 = skb->nh.iph;
+	unsigned int icmp_offset = iph->ihl*4;
+	struct icmphdr *icmph	 = (struct icmphdr *)(skb->nh.raw + icmp_offset);
+	struct iphdr *ciph	 = (struct iphdr *)(icmph + 1);
+
+	if (inout) {
+		iph->saddr = cp->vaddr;
+		ip_send_check(iph);
+		ciph->daddr = cp->vaddr;
+		ip_send_check(ciph);
+	} else {
+		iph->daddr = cp->daddr;
+		ip_send_check(iph);
+		ciph->saddr = cp->daddr;
+		ip_send_check(ciph);
+	}
+
+	/* the TCP/UDP port */
+	if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
+		__u16 *ports = (void *)ciph + ciph->ihl*4;
+
+		if (inout)
+			ports[1] = cp->vport;
+		else
+			ports[0] = cp->dport;
+	}
+
+	/* And finally the ICMP checksum */
+	icmph->checksum = 0;
+	icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	if (inout)
+		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered outgoing ICMP");
+	else
+		IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
+			"Forwarding altered incoming ICMP");
+}
+
+/*
+ *	Handle ICMP messages in the inside-to-outside direction (outgoing).
+ *	Find any that might be relevant, check against existing connections,
+ *	forward to the right destination host if relevant.
+ *	Currently handles error types - unreachable, quench, ttl exceeded.
+ *	(Only used in VS/NAT)
+ */
+static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
+{
+	struct sk_buff *skb = *pskb;
+	struct iphdr *iph;
+	struct icmphdr	_icmph, *ic;
+	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset, ihl, verdict;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
+		skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
+		if (!skb)
+			return NF_STOLEN;
+		*pskb = skb;
+	}
+
+	iph = skb->nh.iph;
+	offset = ihl = iph->ihl * 4;
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+		  ic->type, ntohs(icmp_id(ic)),
+		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->type != ICMP_DEST_UNREACH) &&
+	    (ic->type != ICMP_SOURCE_QUENCH) &&
+	    (ic->type != ICMP_TIME_EXCEEDED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->protocol);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
+		     pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
+
+	offset += cih->ihl * 4;
+
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_out_get(skb, pp, cih, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	verdict = NF_DROP;
+
+	if (IP_VS_FWD_METHOD(cp) != 0) {
+		IP_VS_ERR("shouldn't reach here, because the box is on the"
+			  "half connection in the tun/dr module.\n");
+	}
+
+	/* Ensure the checksum is correct */
+	if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+	    ip_vs_checksum_complete(skb, ihl)) {
+		/* Failed checksum! */
+		IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
+			  NIPQUAD(iph->saddr));
+		goto out;
+	}
+
+	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+		offset += 2 * sizeof(__u16);
+	if (!ip_vs_make_skb_writable(pskb, offset))
+		goto out;
+	skb = *pskb;
+
+	ip_vs_nat_icmp(skb, pp, cp, 1);
+
+	/* do the statistics and put it back */
+	ip_vs_out_stats(cp, skb);
+
+	skb->nfcache |= NFC_IPVS_PROPERTY;
+	verdict = NF_ACCEPT;
+
+  out:
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+
+static inline int is_tcp_reset(const struct sk_buff *skb)
+{
+	struct tcphdr _tcph, *th;
+
+	th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return 0;
+	return th->rst;
+}
+
+/*
+ *	It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
+ *	Check if outgoing packet belongs to the established ip_vs_conn,
+ *      rewrite addresses of the packet and send it on its way...
+ */
+static unsigned int
+ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
+	  const struct net_device *in, const struct net_device *out,
+	  int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff  *skb = *pskb;
+	struct iphdr	*iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn *cp;
+	int ihl;
+
+	EnterFunction(11);
+
+	if (skb->nfcache & NFC_IPVS_PROPERTY)
+		return NF_ACCEPT;
+
+	iph = skb->nh.iph;
+	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
+		int related, verdict = ip_vs_out_icmp(pskb, &related);
+
+		if (related)
+			return verdict;
+		skb = *pskb;
+		iph = skb->nh.iph;
+	}
+
+	pp = ip_vs_proto_get(iph->protocol);
+	if (unlikely(!pp))
+		return NF_ACCEPT;
+
+	/* reassemble IP fragments */
+	if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
+		     !pp->dont_defrag)) {
+		skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
+		if (!skb)
+			return NF_STOLEN;
+		iph = skb->nh.iph;
+		*pskb = skb;
+	}
+
+	ihl = iph->ihl << 2;
+
+	/*
+	 * Check if the packet belongs to an existing entry
+	 */
+	cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
+
+	if (unlikely(!cp)) {
+		if (sysctl_ip_vs_nat_icmp_send &&
+		    (pp->protocol == IPPROTO_TCP ||
+		     pp->protocol == IPPROTO_UDP)) {
+			__u16 _ports[2], *pptr;
+
+			pptr = skb_header_pointer(skb, ihl,
+						  sizeof(_ports), _ports);
+			if (pptr == NULL)
+				return NF_ACCEPT;	/* Not for me */
+			if (ip_vs_lookup_real_service(iph->protocol,
+						      iph->saddr, pptr[0])) {
+				/*
+				 * Notify the real server: there is no
+				 * existing entry if it is not RST
+				 * packet or not TCP packet.
+				 */
+				if (iph->protocol != IPPROTO_TCP
+				    || !is_tcp_reset(skb)) {
+					icmp_send(skb,ICMP_DEST_UNREACH,
+						  ICMP_PORT_UNREACH, 0);
+					return NF_DROP;
+				}
+			}
+		}
+		IP_VS_DBG_PKT(12, pp, skb, 0,
+			      "packet continues traversal as normal");
+		return NF_ACCEPT;
+	}
+
+	IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
+
+	if (!ip_vs_make_skb_writable(pskb, ihl))
+		goto drop;
+
+	/* mangle the packet */
+	if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
+		goto drop;
+	skb = *pskb;
+	skb->nh.iph->saddr = cp->vaddr;
+	ip_send_check(skb->nh.iph);
+
+	IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
+
+	ip_vs_out_stats(cp, skb);
+	ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
+	ip_vs_conn_put(cp);
+
+	skb->nfcache |= NFC_IPVS_PROPERTY;
+
+	LeaveFunction(11);
+	return NF_ACCEPT;
+
+  drop:
+	ip_vs_conn_put(cp);
+	kfree_skb(*pskb);
+	return NF_STOLEN;
+}
+
+
+/*
+ *	Handle ICMP messages in the outside-to-inside direction (incoming).
+ *	Find any that might be relevant, check against existing connections,
+ *	forward to the right destination host if relevant.
+ *	Currently handles error types - unreachable, quench, ttl exceeded.
+ */
+static int 
+ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
+{
+	struct sk_buff *skb = *pskb;
+	struct iphdr *iph;
+	struct icmphdr	_icmph, *ic;
+	struct iphdr	_ciph, *cih;	/* The ip header contained within the ICMP */
+	struct ip_vs_conn *cp;
+	struct ip_vs_protocol *pp;
+	unsigned int offset, ihl, verdict;
+
+	*related = 1;
+
+	/* reassemble IP fragments */
+	if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
+		skb = ip_vs_gather_frags(skb,
+		                         hooknum == NF_IP_LOCAL_IN ?
+					 IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
+		if (!skb)
+			return NF_STOLEN;
+		*pskb = skb;
+	}
+
+	iph = skb->nh.iph;
+	offset = ihl = iph->ihl * 4;
+	ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
+	if (ic == NULL)
+		return NF_DROP;
+
+	IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
+		  ic->type, ntohs(icmp_id(ic)),
+		  NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+
+	/*
+	 * Work through seeing if this is for us.
+	 * These checks are supposed to be in an order that means easy
+	 * things are checked first to speed up processing.... however
+	 * this means that some packets will manage to get a long way
+	 * down this stack and then be rejected, but that's life.
+	 */
+	if ((ic->type != ICMP_DEST_UNREACH) &&
+	    (ic->type != ICMP_SOURCE_QUENCH) &&
+	    (ic->type != ICMP_TIME_EXCEEDED)) {
+		*related = 0;
+		return NF_ACCEPT;
+	}
+
+	/* Now find the contained IP header */
+	offset += sizeof(_icmph);
+	cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
+	if (cih == NULL)
+		return NF_ACCEPT; /* The packet looks wrong, ignore */
+
+	pp = ip_vs_proto_get(cih->protocol);
+	if (!pp)
+		return NF_ACCEPT;
+
+	/* Is the embedded protocol header present? */
+	if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
+		     pp->dont_defrag))
+		return NF_ACCEPT;
+
+	IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
+
+	offset += cih->ihl * 4;
+
+	/* The embedded headers contain source and dest in reverse order */
+	cp = pp->conn_in_get(skb, pp, cih, offset, 1);
+	if (!cp)
+		return NF_ACCEPT;
+
+	verdict = NF_DROP;
+
+	/* Ensure the checksum is correct */
+	if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
+	    ip_vs_checksum_complete(skb, ihl)) {
+		/* Failed checksum! */
+		IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
+			  NIPQUAD(iph->saddr));
+		goto out;
+	}
+
+	/* do the statistics and put it back */
+	ip_vs_in_stats(cp, skb);
+	if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
+		offset += 2 * sizeof(__u16);
+	verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
+	/* do not touch skb anymore */
+
+  out:
+	__ip_vs_conn_put(cp);
+
+	return verdict;
+}
+
+/*
+ *	Check if it's for virtual services, look it up,
+ *	and send it on its way...
+ */
+static unsigned int
+ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
+	 const struct net_device *in, const struct net_device *out,
+	 int (*okfn)(struct sk_buff *))
+{
+	struct sk_buff	*skb = *pskb;
+	struct iphdr	*iph;
+	struct ip_vs_protocol *pp;
+	struct ip_vs_conn *cp;
+	int ret, restart;
+	int ihl;
+
+	/*
+	 *	Big tappo: only PACKET_HOST (neither loopback nor mcasts)
+	 *	... don't know why 1st test DOES NOT include 2nd (?)
+	 */
+	if (unlikely(skb->pkt_type != PACKET_HOST
+		     || skb->dev == &loopback_dev || skb->sk)) {
+		IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
+			  skb->pkt_type,
+			  skb->nh.iph->protocol,
+			  NIPQUAD(skb->nh.iph->daddr));
+		return NF_ACCEPT;
+	}
+
+	iph = skb->nh.iph;
+	if (unlikely(iph->protocol == IPPROTO_ICMP)) {
+		int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
+
+		if (related)
+			return verdict;
+		skb = *pskb;
+		iph = skb->nh.iph;
+	}
+
+	/* Protocol supported? */
+	pp = ip_vs_proto_get(iph->protocol);
+	if (unlikely(!pp))
+		return NF_ACCEPT;
+
+	ihl = iph->ihl << 2;
+
+	/*
+	 * Check if the packet belongs to an existing connection entry
+	 */
+	cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
+
+	if (unlikely(!cp)) {
+		int v;
+
+		if (!pp->conn_schedule(skb, pp, &v, &cp))
+			return v;
+	}
+
+	if (unlikely(!cp)) {
+		/* sorry, all this trouble for a no-hit :) */
+		IP_VS_DBG_PKT(12, pp, skb, 0,
+			      "packet continues traversal as normal");
+		return NF_ACCEPT;
+	}
+
+	IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
+
+	/* Check the server status */
+	if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+		/* the destination server is not available */
+
+		if (sysctl_ip_vs_expire_nodest_conn) {
+			/* try to expire the connection immediately */
+			ip_vs_conn_expire_now(cp);
+		} else {
+			/* don't restart its timer, and silently
+			   drop the packet. */
+			__ip_vs_conn_put(cp);
+		}
+		return NF_DROP;
+	}
+
+	ip_vs_in_stats(cp, skb);
+	restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
+	if (cp->packet_xmit)
+		ret = cp->packet_xmit(skb, cp, pp);
+		/* do not touch skb anymore */
+	else {
+		IP_VS_DBG_RL("warning: packet_xmit is null");
+		ret = NF_ACCEPT;
+	}
+
+	/* increase its packet counter and check if it is needed
+	   to be synchronized */
+	atomic_inc(&cp->in_pkts);
+	if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
+	    (cp->protocol != IPPROTO_TCP ||
+	     cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+	    (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
+	     == sysctl_ip_vs_sync_threshold[0]))
+		ip_vs_sync_conn(cp);
+
+	ip_vs_conn_put(cp);
+	return ret;
+}
+
+
+/*
+ *	It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
+ *      related packets destined for 0.0.0.0/0.
+ *      When fwmark-based virtual service is used, such as transparent
+ *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
+ *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
+ *      sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
+ *      and send them to ip_vs_in_icmp.
+ */
+static unsigned int
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
+		   const struct net_device *in, const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	int r;
+
+	if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
+		return NF_ACCEPT;
+
+	return ip_vs_in_icmp(pskb, &r, hooknum);
+}
+
+
+/* After packet filtering, forward packet through VS/DR, VS/TUN,
+   or VS/NAT(change destination), so that filtering rules can be
+   applied to IPVS. */
+static struct nf_hook_ops ip_vs_in_ops = {
+	.hook		= ip_vs_in,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum        = NF_IP_LOCAL_IN,
+	.priority       = 100,
+};
+
+/* After packet filtering, change source only for VS/NAT */
+static struct nf_hook_ops ip_vs_out_ops = {
+	.hook		= ip_vs_out,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum        = NF_IP_FORWARD,
+	.priority       = 100,
+};
+
+/* After packet filtering (but before ip_vs_out_icmp), catch icmp
+   destined for 0.0.0.0/0, which is for incoming IPVS connections */
+static struct nf_hook_ops ip_vs_forward_icmp_ops = {
+	.hook		= ip_vs_forward_icmp,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum        = NF_IP_FORWARD,
+	.priority       = 99,
+};
+
+/* Before the netfilter connection tracking, exit from POST_ROUTING */
+static struct nf_hook_ops ip_vs_post_routing_ops = {
+	.hook		= ip_vs_post_routing,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum        = NF_IP_POST_ROUTING,
+	.priority       = NF_IP_PRI_NAT_SRC-1,
+};
+
+
+/*
+ *	Initialize IP Virtual Server
+ */
+static int __init ip_vs_init(void)
+{
+	int ret;
+
+	ret = ip_vs_control_init();
+	if (ret < 0) {
+		IP_VS_ERR("can't setup control.\n");
+		goto cleanup_nothing;
+	}
+
+	ip_vs_protocol_init();
+
+	ret = ip_vs_app_init();
+	if (ret < 0) {
+		IP_VS_ERR("can't setup application helper.\n");
+		goto cleanup_protocol;
+	}
+
+	ret = ip_vs_conn_init();
+	if (ret < 0) {
+		IP_VS_ERR("can't setup connection table.\n");
+		goto cleanup_app;
+	}
+
+	ret = nf_register_hook(&ip_vs_in_ops);
+	if (ret < 0) {
+		IP_VS_ERR("can't register in hook.\n");
+		goto cleanup_conn;
+	}
+
+	ret = nf_register_hook(&ip_vs_out_ops);
+	if (ret < 0) {
+		IP_VS_ERR("can't register out hook.\n");
+		goto cleanup_inops;
+	}
+	ret = nf_register_hook(&ip_vs_post_routing_ops);
+	if (ret < 0) {
+		IP_VS_ERR("can't register post_routing hook.\n");
+		goto cleanup_outops;
+	}
+	ret = nf_register_hook(&ip_vs_forward_icmp_ops);
+	if (ret < 0) {
+		IP_VS_ERR("can't register forward_icmp hook.\n");
+		goto cleanup_postroutingops;
+	}
+
+	IP_VS_INFO("ipvs loaded.\n");
+	return ret;
+
+  cleanup_postroutingops:
+	nf_unregister_hook(&ip_vs_post_routing_ops);
+  cleanup_outops:
+	nf_unregister_hook(&ip_vs_out_ops);
+  cleanup_inops:
+	nf_unregister_hook(&ip_vs_in_ops);
+  cleanup_conn:
+	ip_vs_conn_cleanup();
+  cleanup_app:
+	ip_vs_app_cleanup();
+  cleanup_protocol:
+	ip_vs_protocol_cleanup();
+	ip_vs_control_cleanup();
+  cleanup_nothing:
+	return ret;
+}
+
+static void __exit ip_vs_cleanup(void)
+{
+	nf_unregister_hook(&ip_vs_forward_icmp_ops);
+	nf_unregister_hook(&ip_vs_post_routing_ops);
+	nf_unregister_hook(&ip_vs_out_ops);
+	nf_unregister_hook(&ip_vs_in_ops);
+	ip_vs_conn_cleanup();
+	ip_vs_app_cleanup();
+	ip_vs_protocol_cleanup();
+	ip_vs_control_cleanup();
+	IP_VS_INFO("ipvs unloaded.\n");
+}
+
+module_init(ip_vs_init);
+module_exit(ip_vs_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c
new file mode 100644
index 000000000000..218d9701036e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ctl.c
@@ -0,0 +1,2391 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the NetFilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Version:     $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/swap.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip.h>
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+
+#include <net/ip_vs.h>
+
+/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
+static DECLARE_MUTEX(__ip_vs_mutex);
+
+/* lock for service table */
+static DEFINE_RWLOCK(__ip_vs_svc_lock);
+
+/* lock for table with the real services */
+static DEFINE_RWLOCK(__ip_vs_rs_lock);
+
+/* lock for state and timeout tables */
+static DEFINE_RWLOCK(__ip_vs_securetcp_lock);
+
+/* lock for drop entry handling */
+static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
+
+/* lock for drop packet handling */
+static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
+
+/* 1/rate drop and drop-entry variables */
+int ip_vs_drop_rate = 0;
+int ip_vs_drop_counter = 0;
+static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
+
+/* number of virtual services */
+static int ip_vs_num_services = 0;
+
+/* sysctl variables */
+static int sysctl_ip_vs_drop_entry = 0;
+static int sysctl_ip_vs_drop_packet = 0;
+static int sysctl_ip_vs_secure_tcp = 0;
+static int sysctl_ip_vs_amemthresh = 1024;
+static int sysctl_ip_vs_am_droprate = 10;
+int sysctl_ip_vs_cache_bypass = 0;
+int sysctl_ip_vs_expire_nodest_conn = 0;
+int sysctl_ip_vs_expire_quiescent_template = 0;
+int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
+int sysctl_ip_vs_nat_icmp_send = 0;
+
+
+#ifdef CONFIG_IP_VS_DEBUG
+static int sysctl_ip_vs_debug_level = 0;
+
+int ip_vs_get_debug_level(void)
+{
+	return sysctl_ip_vs_debug_level;
+}
+#endif
+
+/*
+ *	update_defense_level is called from keventd and from sysctl.
+ */
+static void update_defense_level(void)
+{
+	struct sysinfo i;
+	static int old_secure_tcp = 0;
+	int availmem;
+	int nomem;
+	int to_change = -1;
+
+	/* we only count free and buffered memory (in pages) */
+	si_meminfo(&i);
+	availmem = i.freeram + i.bufferram;
+	/* however in linux 2.5 the i.bufferram is total page cache size,
+	   we need adjust it */
+	/* si_swapinfo(&i); */
+	/* availmem = availmem - (i.totalswap - i.freeswap); */
+
+	nomem = (availmem < sysctl_ip_vs_amemthresh);
+
+	/* drop_entry */
+	spin_lock(&__ip_vs_dropentry_lock);
+	switch (sysctl_ip_vs_drop_entry) {
+	case 0:
+		atomic_set(&ip_vs_dropentry, 0);
+		break;
+	case 1:
+		if (nomem) {
+			atomic_set(&ip_vs_dropentry, 1);
+			sysctl_ip_vs_drop_entry = 2;
+		} else {
+			atomic_set(&ip_vs_dropentry, 0);
+		}
+		break;
+	case 2:
+		if (nomem) {
+			atomic_set(&ip_vs_dropentry, 1);
+		} else {
+			atomic_set(&ip_vs_dropentry, 0);
+			sysctl_ip_vs_drop_entry = 1;
+		};
+		break;
+	case 3:
+		atomic_set(&ip_vs_dropentry, 1);
+		break;
+	}
+	spin_unlock(&__ip_vs_dropentry_lock);
+
+	/* drop_packet */
+	spin_lock(&__ip_vs_droppacket_lock);
+	switch (sysctl_ip_vs_drop_packet) {
+	case 0:
+		ip_vs_drop_rate = 0;
+		break;
+	case 1:
+		if (nomem) {
+			ip_vs_drop_rate = ip_vs_drop_counter
+				= sysctl_ip_vs_amemthresh /
+				(sysctl_ip_vs_amemthresh-availmem);
+			sysctl_ip_vs_drop_packet = 2;
+		} else {
+			ip_vs_drop_rate = 0;
+		}
+		break;
+	case 2:
+		if (nomem) {
+			ip_vs_drop_rate = ip_vs_drop_counter
+				= sysctl_ip_vs_amemthresh /
+				(sysctl_ip_vs_amemthresh-availmem);
+		} else {
+			ip_vs_drop_rate = 0;
+			sysctl_ip_vs_drop_packet = 1;
+		}
+		break;
+	case 3:
+		ip_vs_drop_rate = sysctl_ip_vs_am_droprate;
+		break;
+	}
+	spin_unlock(&__ip_vs_droppacket_lock);
+
+	/* secure_tcp */
+	write_lock(&__ip_vs_securetcp_lock);
+	switch (sysctl_ip_vs_secure_tcp) {
+	case 0:
+		if (old_secure_tcp >= 2)
+			to_change = 0;
+		break;
+	case 1:
+		if (nomem) {
+			if (old_secure_tcp < 2)
+				to_change = 1;
+			sysctl_ip_vs_secure_tcp = 2;
+		} else {
+			if (old_secure_tcp >= 2)
+				to_change = 0;
+		}
+		break;
+	case 2:
+		if (nomem) {
+			if (old_secure_tcp < 2)
+				to_change = 1;
+		} else {
+			if (old_secure_tcp >= 2)
+				to_change = 0;
+			sysctl_ip_vs_secure_tcp = 1;
+		}
+		break;
+	case 3:
+		if (old_secure_tcp < 2)
+			to_change = 1;
+		break;
+	}
+	old_secure_tcp = sysctl_ip_vs_secure_tcp;
+	if (to_change >= 0)
+		ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1);
+	write_unlock(&__ip_vs_securetcp_lock);
+}
+
+
+/*
+ *	Timer for checking the defense
+ */
+#define DEFENSE_TIMER_PERIOD	1*HZ
+static void defense_work_handler(void *data);
+static DECLARE_WORK(defense_work, defense_work_handler, NULL);
+
+static void defense_work_handler(void *data)
+{
+	update_defense_level();
+	if (atomic_read(&ip_vs_dropentry))
+		ip_vs_random_dropentry();
+
+	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+}
+
+int
+ip_vs_use_count_inc(void)
+{
+	return try_module_get(THIS_MODULE);
+}
+
+void
+ip_vs_use_count_dec(void)
+{
+	module_put(THIS_MODULE);
+}
+
+
+/*
+ *	Hash table: for virtual service lookups
+ */
+#define IP_VS_SVC_TAB_BITS 8
+#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
+#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)
+
+/* the service table hashed by <protocol, addr, port> */
+static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
+/* the service table hashed by fwmark */
+static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
+
+/*
+ *	Hash table: for real service lookups
+ */
+#define IP_VS_RTAB_BITS 4
+#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
+#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
+
+static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
+
+/*
+ *	Trash for destinations
+ */
+static LIST_HEAD(ip_vs_dest_trash);
+
+/*
+ *	FTP & NULL virtual service counters
+ */
+static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
+static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
+
+
+/*
+ *	Returns hash value for virtual service
+ */
+static __inline__ unsigned
+ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port)
+{
+	register unsigned porth = ntohs(port);
+
+	return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
+		& IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *	Returns hash value of fwmark for virtual service lookup
+ */
+static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark)
+{
+	return fwmark & IP_VS_SVC_TAB_MASK;
+}
+
+/*
+ *	Hashes a service in the ip_vs_svc_table by <proto,addr,port>
+ *	or in the ip_vs_svc_fwm_table by fwmark.
+ *	Should be called with locked tables.
+ */
+static int ip_vs_svc_hash(struct ip_vs_service *svc)
+{
+	unsigned hash;
+
+	if (svc->flags & IP_VS_SVC_F_HASHED) {
+		IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	if (svc->fwmark == 0) {
+		/*
+		 *  Hash it by <protocol,addr,port> in ip_vs_svc_table
+		 */
+		hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port);
+		list_add(&svc->s_list, &ip_vs_svc_table[hash]);
+	} else {
+		/*
+		 *  Hash it by fwmark in ip_vs_svc_fwm_table
+		 */
+		hash = ip_vs_svc_fwm_hashkey(svc->fwmark);
+		list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
+	}
+
+	svc->flags |= IP_VS_SVC_F_HASHED;
+	/* increase its refcnt because it is referenced by the svc table */
+	atomic_inc(&svc->refcnt);
+	return 1;
+}
+
+
+/*
+ *	Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table.
+ *	Should be called with locked tables.
+ */
+static int ip_vs_svc_unhash(struct ip_vs_service *svc)
+{
+	if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
+		IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	if (svc->fwmark == 0) {
+		/* Remove it from the ip_vs_svc_table table */
+		list_del(&svc->s_list);
+	} else {
+		/* Remove it from the ip_vs_svc_fwm_table table */
+		list_del(&svc->f_list);
+	}
+
+	svc->flags &= ~IP_VS_SVC_F_HASHED;
+	atomic_dec(&svc->refcnt);
+	return 1;
+}
+
+
+/*
+ *	Get service by {proto,addr,port} in the service table.
+ */
+static __inline__ struct ip_vs_service *
+__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport)
+{
+	unsigned hash;
+	struct ip_vs_service *svc;
+
+	/* Check for "full" addressed entries */
+	hash = ip_vs_svc_hashkey(protocol, vaddr, vport);
+
+	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
+		if ((svc->addr == vaddr)
+		    && (svc->port == vport)
+		    && (svc->protocol == protocol)) {
+			/* HIT */
+			atomic_inc(&svc->usecnt);
+			return svc;
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+ *	Get service by {fwmark} in the service table.
+ */
+static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark)
+{
+	unsigned hash;
+	struct ip_vs_service *svc;
+
+	/* Check for fwmark addressed entries */
+	hash = ip_vs_svc_fwm_hashkey(fwmark);
+
+	list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
+		if (svc->fwmark == fwmark) {
+			/* HIT */
+			atomic_inc(&svc->usecnt);
+			return svc;
+		}
+	}
+
+	return NULL;
+}
+
+struct ip_vs_service *
+ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport)
+{
+	struct ip_vs_service *svc;
+
+	read_lock(&__ip_vs_svc_lock);
+
+	/*
+	 *	Check the table hashed by fwmark first
+	 */
+	if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark)))
+		goto out;
+
+	/*
+	 *	Check the table hashed by <protocol,addr,port>
+	 *	for "full" addressed entries
+	 */
+	svc = __ip_vs_service_get(protocol, vaddr, vport);
+
+	if (svc == NULL
+	    && protocol == IPPROTO_TCP
+	    && atomic_read(&ip_vs_ftpsvc_counter)
+	    && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
+		/*
+		 * Check if ftp service entry exists, the packet
+		 * might belong to FTP data connections.
+		 */
+		svc = __ip_vs_service_get(protocol, vaddr, FTPPORT);
+	}
+
+	if (svc == NULL
+	    && atomic_read(&ip_vs_nullsvc_counter)) {
+		/*
+		 * Check if the catch-all port (port zero) exists
+		 */
+		svc = __ip_vs_service_get(protocol, vaddr, 0);
+	}
+
+  out:
+	read_unlock(&__ip_vs_svc_lock);
+
+	IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n",
+		  fwmark, ip_vs_proto_name(protocol),
+		  NIPQUAD(vaddr), ntohs(vport),
+		  svc?"hit":"not hit");
+
+	return svc;
+}
+
+
+static inline void
+__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	atomic_inc(&svc->refcnt);
+	dest->svc = svc;
+}
+
+static inline void
+__ip_vs_unbind_svc(struct ip_vs_dest *dest)
+{
+	struct ip_vs_service *svc = dest->svc;
+
+	dest->svc = NULL;
+	if (atomic_dec_and_test(&svc->refcnt))
+		kfree(svc);
+}
+
+
+/*
+ *	Returns hash value for real service
+ */
+static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port)
+{
+	register unsigned porth = ntohs(port);
+
+	return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth)
+		& IP_VS_RTAB_MASK;
+}
+
+/*
+ *	Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>.
+ *	should be called with locked tables.
+ */
+static int ip_vs_rs_hash(struct ip_vs_dest *dest)
+{
+	unsigned hash;
+
+	if (!list_empty(&dest->d_list)) {
+		return 0;
+	}
+
+	/*
+	 *	Hash by proto,addr,port,
+	 *	which are the parameters of the real service.
+	 */
+	hash = ip_vs_rs_hashkey(dest->addr, dest->port);
+	list_add(&dest->d_list, &ip_vs_rtable[hash]);
+
+	return 1;
+}
+
+/*
+ *	UNhashes ip_vs_dest from ip_vs_rtable.
+ *	should be called with locked tables.
+ */
+static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
+{
+	/*
+	 * Remove it from the ip_vs_rtable table.
+	 */
+	if (!list_empty(&dest->d_list)) {
+		list_del(&dest->d_list);
+		INIT_LIST_HEAD(&dest->d_list);
+	}
+
+	return 1;
+}
+
+/*
+ *	Lookup real service by <proto,addr,port> in the real service table.
+ */
+struct ip_vs_dest *
+ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport)
+{
+	unsigned hash;
+	struct ip_vs_dest *dest;
+
+	/*
+	 *	Check for "full" addressed entries
+	 *	Return the first found entry
+	 */
+	hash = ip_vs_rs_hashkey(daddr, dport);
+
+	read_lock(&__ip_vs_rs_lock);
+	list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) {
+		if ((dest->addr == daddr)
+		    && (dest->port == dport)
+		    && ((dest->protocol == protocol) ||
+			dest->vfwmark)) {
+			/* HIT */
+			read_unlock(&__ip_vs_rs_lock);
+			return dest;
+		}
+	}
+	read_unlock(&__ip_vs_rs_lock);
+
+	return NULL;
+}
+
+/*
+ *	Lookup destination by {addr,port} in the given service
+ */
+static struct ip_vs_dest *
+ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
+{
+	struct ip_vs_dest *dest;
+
+	/*
+	 * Find the destination for the given service
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if ((dest->addr == daddr) && (dest->port == dport)) {
+			/* HIT */
+			return dest;
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+ *  Lookup dest by {svc,addr,port} in the destination trash.
+ *  The destination trash is used to hold the destinations that are removed
+ *  from the service table but are still referenced by some conn entries.
+ *  The reason to add the destination trash is when the dest is temporary
+ *  down (either by administrator or by monitor program), the dest can be
+ *  picked back from the trash, the remaining connections to the dest can
+ *  continue, and the counting information of the dest is also useful for
+ *  scheduling.
+ */
+static struct ip_vs_dest *
+ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport)
+{
+	struct ip_vs_dest *dest, *nxt;
+
+	/*
+	 * Find the destination in trash
+	 */
+	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+		IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, "
+			  "refcnt=%d\n",
+			  dest->vfwmark,
+			  NIPQUAD(dest->addr), ntohs(dest->port),
+			  atomic_read(&dest->refcnt));
+		if (dest->addr == daddr &&
+		    dest->port == dport &&
+		    dest->vfwmark == svc->fwmark &&
+		    dest->protocol == svc->protocol &&
+		    (svc->fwmark ||
+		     (dest->vaddr == svc->addr &&
+		      dest->vport == svc->port))) {
+			/* HIT */
+			return dest;
+		}
+
+		/*
+		 * Try to purge the destination from trash if not referenced
+		 */
+		if (atomic_read(&dest->refcnt) == 1) {
+			IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u "
+				  "from trash\n",
+				  dest->vfwmark,
+				  NIPQUAD(dest->addr), ntohs(dest->port));
+			list_del(&dest->n_list);
+			ip_vs_dst_reset(dest);
+			__ip_vs_unbind_svc(dest);
+			kfree(dest);
+		}
+	}
+
+	return NULL;
+}
+
+
+/*
+ *  Clean up all the destinations in the trash
+ *  Called by the ip_vs_control_cleanup()
+ *
+ *  When the ip_vs_control_clearup is activated by ipvs module exit,
+ *  the service tables must have been flushed and all the connections
+ *  are expired, and the refcnt of each destination in the trash must
+ *  be 1, so we simply release them here.
+ */
+static void ip_vs_trash_cleanup(void)
+{
+	struct ip_vs_dest *dest, *nxt;
+
+	list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) {
+		list_del(&dest->n_list);
+		ip_vs_dst_reset(dest);
+		__ip_vs_unbind_svc(dest);
+		kfree(dest);
+	}
+}
+
+
+static void
+ip_vs_zero_stats(struct ip_vs_stats *stats)
+{
+	spin_lock_bh(&stats->lock);
+	memset(stats, 0, (char *)&stats->lock - (char *)stats);
+	spin_unlock_bh(&stats->lock);
+	ip_vs_zero_estimator(stats);
+}
+
+/*
+ *	Update a destination in the given service
+ */
+static void
+__ip_vs_update_dest(struct ip_vs_service *svc,
+		    struct ip_vs_dest *dest, struct ip_vs_dest_user *udest)
+{
+	int conn_flags;
+
+	/* set the weight and the flags */
+	atomic_set(&dest->weight, udest->weight);
+	conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE;
+
+	/* check if local node and update the flags */
+	if (inet_addr_type(udest->addr) == RTN_LOCAL) {
+		conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK)
+			| IP_VS_CONN_F_LOCALNODE;
+	}
+
+	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
+	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) {
+		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
+	} else {
+		/*
+		 *    Put the real service in ip_vs_rtable if not present.
+		 *    For now only for NAT!
+		 */
+		write_lock_bh(&__ip_vs_rs_lock);
+		ip_vs_rs_hash(dest);
+		write_unlock_bh(&__ip_vs_rs_lock);
+	}
+	atomic_set(&dest->conn_flags, conn_flags);
+
+	/* bind the service */
+	if (!dest->svc) {
+		__ip_vs_bind_svc(dest, svc);
+	} else {
+		if (dest->svc != svc) {
+			__ip_vs_unbind_svc(dest);
+			ip_vs_zero_stats(&dest->stats);
+			__ip_vs_bind_svc(dest, svc);
+		}
+	}
+
+	/* set the dest status flags */
+	dest->flags |= IP_VS_DEST_F_AVAILABLE;
+
+	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
+		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
+	dest->u_threshold = udest->u_threshold;
+	dest->l_threshold = udest->l_threshold;
+}
+
+
+/*
+ *	Create a destination for the given service
+ */
+static int
+ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest,
+	       struct ip_vs_dest **dest_p)
+{
+	struct ip_vs_dest *dest;
+	unsigned atype;
+
+	EnterFunction(2);
+
+	atype = inet_addr_type(udest->addr);
+	if (atype != RTN_LOCAL && atype != RTN_UNICAST)
+		return -EINVAL;
+
+	dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC);
+	if (dest == NULL) {
+		IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n");
+		return -ENOMEM;
+	}
+	memset(dest, 0, sizeof(struct ip_vs_dest));
+
+	dest->protocol = svc->protocol;
+	dest->vaddr = svc->addr;
+	dest->vport = svc->port;
+	dest->vfwmark = svc->fwmark;
+	dest->addr = udest->addr;
+	dest->port = udest->port;
+
+	atomic_set(&dest->activeconns, 0);
+	atomic_set(&dest->inactconns, 0);
+	atomic_set(&dest->persistconns, 0);
+	atomic_set(&dest->refcnt, 0);
+
+	INIT_LIST_HEAD(&dest->d_list);
+	spin_lock_init(&dest->dst_lock);
+	spin_lock_init(&dest->stats.lock);
+	__ip_vs_update_dest(svc, dest, udest);
+	ip_vs_new_estimator(&dest->stats);
+
+	*dest_p = dest;
+
+	LeaveFunction(2);
+	return 0;
+}
+
+
+/*
+ *	Add a destination into an existing service
+ */
+static int
+ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
+{
+	struct ip_vs_dest *dest;
+	__u32 daddr = udest->addr;
+	__u16 dport = udest->port;
+	int ret;
+
+	EnterFunction(2);
+
+	if (udest->weight < 0) {
+		IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n");
+		return -ERANGE;
+	}
+
+	if (udest->l_threshold > udest->u_threshold) {
+		IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than "
+			  "upper threshold\n");
+		return -ERANGE;
+	}
+
+	/*
+	 * Check if the dest already exists in the list
+	 */
+	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	if (dest != NULL) {
+		IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n");
+		return -EEXIST;
+	}
+
+	/*
+	 * Check if the dest already exists in the trash and
+	 * is from the same service
+	 */
+	dest = ip_vs_trash_get_dest(svc, daddr, dport);
+	if (dest != NULL) {
+		IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, "
+			  "refcnt=%d, service %u/%u.%u.%u.%u:%u\n",
+			  NIPQUAD(daddr), ntohs(dport),
+			  atomic_read(&dest->refcnt),
+			  dest->vfwmark,
+			  NIPQUAD(dest->vaddr),
+			  ntohs(dest->vport));
+		__ip_vs_update_dest(svc, dest, udest);
+
+		/*
+		 * Get the destination from the trash
+		 */
+		list_del(&dest->n_list);
+
+		ip_vs_new_estimator(&dest->stats);
+
+		write_lock_bh(&__ip_vs_svc_lock);
+
+		/*
+		 * Wait until all other svc users go away.
+		 */
+		IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+		list_add(&dest->n_list, &svc->destinations);
+		svc->num_dests++;
+
+		/* call the update_service function of its scheduler */
+		svc->scheduler->update_service(svc);
+
+		write_unlock_bh(&__ip_vs_svc_lock);
+		return 0;
+	}
+
+	/*
+	 * Allocate and initialize the dest structure
+	 */
+	ret = ip_vs_new_dest(svc, udest, &dest);
+	if (ret) {
+		return ret;
+	}
+
+	/*
+	 * Add the dest entry into the list
+	 */
+	atomic_inc(&dest->refcnt);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 * Wait until all other svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	list_add(&dest->n_list, &svc->destinations);
+	svc->num_dests++;
+
+	/* call the update_service function of its scheduler */
+	svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	LeaveFunction(2);
+
+	return 0;
+}
+
+
+/*
+ *	Edit a destination in the given service
+ */
+static int
+ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest)
+{
+	struct ip_vs_dest *dest;
+	__u32 daddr = udest->addr;
+	__u16 dport = udest->port;
+
+	EnterFunction(2);
+
+	if (udest->weight < 0) {
+		IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n");
+		return -ERANGE;
+	}
+
+	if (udest->l_threshold > udest->u_threshold) {
+		IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than "
+			  "upper threshold\n");
+		return -ERANGE;
+	}
+
+	/*
+	 *  Lookup the destination list
+	 */
+	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	if (dest == NULL) {
+		IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n");
+		return -ENOENT;
+	}
+
+	__ip_vs_update_dest(svc, dest, udest);
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/* Wait until all other svc users go away */
+	while (atomic_read(&svc->usecnt) > 1) {};
+
+	/* call the update_service, because server weight may be changed */
+	svc->scheduler->update_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	LeaveFunction(2);
+
+	return 0;
+}
+
+
+/*
+ *	Delete a destination (must be already unlinked from the service)
+ */
+static void __ip_vs_del_dest(struct ip_vs_dest *dest)
+{
+	ip_vs_kill_estimator(&dest->stats);
+
+	/*
+	 *  Remove it from the d-linked list with the real services.
+	 */
+	write_lock_bh(&__ip_vs_rs_lock);
+	ip_vs_rs_unhash(dest);
+	write_unlock_bh(&__ip_vs_rs_lock);
+
+	/*
+	 *  Decrease the refcnt of the dest, and free the dest
+	 *  if nobody refers to it (refcnt=0). Otherwise, throw
+	 *  the destination into the trash.
+	 */
+	if (atomic_dec_and_test(&dest->refcnt)) {
+		ip_vs_dst_reset(dest);
+		/* simply decrease svc->refcnt here, let the caller check
+		   and release the service if nobody refers to it.
+		   Only user context can release destination and service,
+		   and only one user context can update virtual service at a
+		   time, so the operation here is OK */
+		atomic_dec(&dest->svc->refcnt);
+		kfree(dest);
+	} else {
+		IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n",
+			  NIPQUAD(dest->addr), ntohs(dest->port),
+			  atomic_read(&dest->refcnt));
+		list_add(&dest->n_list, &ip_vs_dest_trash);
+		atomic_inc(&dest->refcnt);
+	}
+}
+
+
+/*
+ *	Unlink a destination from the given service
+ */
+static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
+				struct ip_vs_dest *dest,
+				int svcupd)
+{
+	dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+
+	/*
+	 *  Remove it from the d-linked destination list.
+	 */
+	list_del(&dest->n_list);
+	svc->num_dests--;
+	if (svcupd) {
+		/*
+		 *  Call the update_service function of its scheduler
+		 */
+		svc->scheduler->update_service(svc);
+	}
+}
+
+
+/*
+ *	Delete a destination server in the given service
+ */
+static int
+ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest)
+{
+	struct ip_vs_dest *dest;
+	__u32 daddr = udest->addr;
+	__u16 dport = udest->port;
+
+	EnterFunction(2);
+
+	dest = ip_vs_lookup_dest(svc, daddr, dport);
+	if (dest == NULL) {
+		IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n");
+		return -ENOENT;
+	}
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 *	Wait until all other svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	/*
+	 *	Unlink dest from the service
+	 */
+	__ip_vs_unlink_dest(svc, dest, 1);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 *	Delete the destination
+	 */
+	__ip_vs_del_dest(dest);
+
+	LeaveFunction(2);
+
+	return 0;
+}
+
+
+/*
+ *	Add a service into the service hash table
+ */
+static int
+ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p)
+{
+	int ret = 0;
+	struct ip_vs_scheduler *sched = NULL;
+	struct ip_vs_service *svc = NULL;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	/* Lookup the scheduler by 'u->sched_name' */
+	sched = ip_vs_scheduler_get(u->sched_name);
+	if (sched == NULL) {
+		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+			   u->sched_name);
+		ret = -ENOENT;
+		goto out_mod_dec;
+	}
+
+	svc = (struct ip_vs_service *)
+		kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC);
+	if (svc == NULL) {
+		IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n");
+		ret = -ENOMEM;
+		goto out_err;
+	}
+	memset(svc, 0, sizeof(struct ip_vs_service));
+
+	/* I'm the first user of the service */
+	atomic_set(&svc->usecnt, 1);
+	atomic_set(&svc->refcnt, 0);
+
+	svc->protocol = u->protocol;
+	svc->addr = u->addr;
+	svc->port = u->port;
+	svc->fwmark = u->fwmark;
+	svc->flags = u->flags;
+	svc->timeout = u->timeout * HZ;
+	svc->netmask = u->netmask;
+
+	INIT_LIST_HEAD(&svc->destinations);
+	rwlock_init(&svc->sched_lock);
+	spin_lock_init(&svc->stats.lock);
+
+	/* Bind the scheduler */
+	ret = ip_vs_bind_scheduler(svc, sched);
+	if (ret)
+		goto out_err;
+	sched = NULL;
+
+	/* Update the virtual service counters */
+	if (svc->port == FTPPORT)
+		atomic_inc(&ip_vs_ftpsvc_counter);
+	else if (svc->port == 0)
+		atomic_inc(&ip_vs_nullsvc_counter);
+
+	ip_vs_new_estimator(&svc->stats);
+	ip_vs_num_services++;
+
+	/* Hash the service into the service table */
+	write_lock_bh(&__ip_vs_svc_lock);
+	ip_vs_svc_hash(svc);
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	*svc_p = svc;
+	return 0;
+
+  out_err:
+	if (svc != NULL) {
+		if (svc->scheduler)
+			ip_vs_unbind_scheduler(svc);
+		if (svc->inc) {
+			local_bh_disable();
+			ip_vs_app_inc_put(svc->inc);
+			local_bh_enable();
+		}
+		kfree(svc);
+	}
+	ip_vs_scheduler_put(sched);
+
+  out_mod_dec:
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return ret;
+}
+
+
+/*
+ *	Edit a service and bind it with a new scheduler
+ */
+static int
+ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u)
+{
+	struct ip_vs_scheduler *sched, *old_sched;
+	int ret = 0;
+
+	/*
+	 * Lookup the scheduler, by 'u->sched_name'
+	 */
+	sched = ip_vs_scheduler_get(u->sched_name);
+	if (sched == NULL) {
+		IP_VS_INFO("Scheduler module ip_vs_%s not found\n",
+			   u->sched_name);
+		return -ENOENT;
+	}
+	old_sched = sched;
+
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	/*
+	 * Wait until all other svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	/*
+	 * Set the flags and timeout value
+	 */
+	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
+	svc->timeout = u->timeout * HZ;
+	svc->netmask = u->netmask;
+
+	old_sched = svc->scheduler;
+	if (sched != old_sched) {
+		/*
+		 * Unbind the old scheduler
+		 */
+		if ((ret = ip_vs_unbind_scheduler(svc))) {
+			old_sched = sched;
+			goto out;
+		}
+
+		/*
+		 * Bind the new scheduler
+		 */
+		if ((ret = ip_vs_bind_scheduler(svc, sched))) {
+			/*
+			 * If ip_vs_bind_scheduler fails, restore the old
+			 * scheduler.
+			 * The main reason of failure is out of memory.
+			 *
+			 * The question is if the old scheduler can be
+			 * restored all the time. TODO: if it cannot be
+			 * restored some time, we must delete the service,
+			 * otherwise the system may crash.
+			 */
+			ip_vs_bind_scheduler(svc, old_sched);
+			old_sched = sched;
+			goto out;
+		}
+	}
+
+  out:
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	if (old_sched)
+		ip_vs_scheduler_put(old_sched);
+
+	return ret;
+}
+
+
+/*
+ *	Delete a service from the service list
+ *	- The service must be unlinked, unlocked and not referenced!
+ *	- We are called under _bh lock
+ */
+static void __ip_vs_del_service(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest, *nxt;
+	struct ip_vs_scheduler *old_sched;
+
+	ip_vs_num_services--;
+	ip_vs_kill_estimator(&svc->stats);
+
+	/* Unbind scheduler */
+	old_sched = svc->scheduler;
+	ip_vs_unbind_scheduler(svc);
+	if (old_sched)
+		ip_vs_scheduler_put(old_sched);
+
+	/* Unbind app inc */
+	if (svc->inc) {
+		ip_vs_app_inc_put(svc->inc);
+		svc->inc = NULL;
+	}
+
+	/*
+	 *    Unlink the whole destination list
+	 */
+	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
+		__ip_vs_unlink_dest(svc, dest, 0);
+		__ip_vs_del_dest(dest);
+	}
+
+	/*
+	 *    Update the virtual service counters
+	 */
+	if (svc->port == FTPPORT)
+		atomic_dec(&ip_vs_ftpsvc_counter);
+	else if (svc->port == 0)
+		atomic_dec(&ip_vs_nullsvc_counter);
+
+	/*
+	 *    Free the service if nobody refers to it
+	 */
+	if (atomic_read(&svc->refcnt) == 0)
+		kfree(svc);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+}
+
+/*
+ *	Delete a service from the service list
+ */
+static int ip_vs_del_service(struct ip_vs_service *svc)
+{
+	if (svc == NULL)
+		return -EEXIST;
+
+	/*
+	 * Unhash it from the service table
+	 */
+	write_lock_bh(&__ip_vs_svc_lock);
+
+	ip_vs_svc_unhash(svc);
+
+	/*
+	 * Wait until all the svc users go away.
+	 */
+	IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1);
+
+	__ip_vs_del_service(svc);
+
+	write_unlock_bh(&__ip_vs_svc_lock);
+
+	return 0;
+}
+
+
+/*
+ *	Flush all the virtual services
+ */
+static int ip_vs_flush(void)
+{
+	int idx;
+	struct ip_vs_service *svc, *nxt;
+
+	/*
+	 * Flush the service table hashed by <protocol,addr,port>
+	 */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) {
+			write_lock_bh(&__ip_vs_svc_lock);
+			ip_vs_svc_unhash(svc);
+			/*
+			 * Wait until all the svc users go away.
+			 */
+			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+			__ip_vs_del_service(svc);
+			write_unlock_bh(&__ip_vs_svc_lock);
+		}
+	}
+
+	/*
+	 * Flush the service table hashed by fwmark
+	 */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry_safe(svc, nxt,
+					 &ip_vs_svc_fwm_table[idx], f_list) {
+			write_lock_bh(&__ip_vs_svc_lock);
+			ip_vs_svc_unhash(svc);
+			/*
+			 * Wait until all the svc users go away.
+			 */
+			IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
+			__ip_vs_del_service(svc);
+			write_unlock_bh(&__ip_vs_svc_lock);
+		}
+	}
+
+	return 0;
+}
+
+
+/*
+ *	Zero counters in a service or all services
+ */
+static int ip_vs_zero_service(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+
+	write_lock_bh(&__ip_vs_svc_lock);
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		ip_vs_zero_stats(&dest->stats);
+	}
+	ip_vs_zero_stats(&svc->stats);
+	write_unlock_bh(&__ip_vs_svc_lock);
+	return 0;
+}
+
+static int ip_vs_zero_all(void)
+{
+	int idx;
+	struct ip_vs_service *svc;
+
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+			ip_vs_zero_service(svc);
+		}
+	}
+
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+			ip_vs_zero_service(svc);
+		}
+	}
+
+	ip_vs_zero_stats(&ip_vs_stats);
+	return 0;
+}
+
+
+static int
+proc_do_defense_mode(ctl_table *table, int write, struct file * filp,
+		     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int val = *valp;
+	int rc;
+
+	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	if (write && (*valp != val)) {
+		if ((*valp < 0) || (*valp > 3)) {
+			/* Restore the correct value */
+			*valp = val;
+		} else {
+			local_bh_disable();
+			update_defense_level();
+			local_bh_enable();
+		}
+	}
+	return rc;
+}
+
+
+static int
+proc_do_sync_threshold(ctl_table *table, int write, struct file *filp,
+		       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	int val[2];
+	int rc;
+
+	/* backup the value first */
+	memcpy(val, valp, sizeof(val));
+
+	rc = proc_dointvec(table, write, filp, buffer, lenp, ppos);
+	if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) {
+		/* Restore the correct value */
+		memcpy(valp, val, sizeof(val));
+	}
+	return rc;
+}
+
+
+/*
+ *	IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
+ */
+
+static struct ctl_table vs_vars[] = {
+	{
+		.ctl_name	= NET_IPV4_VS_AMEMTHRESH,
+		.procname	= "amemthresh",
+		.data		= &sysctl_ip_vs_amemthresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_IP_VS_DEBUG
+	{
+		.ctl_name	= NET_IPV4_VS_DEBUG_LEVEL,
+		.procname	= "debug_level",
+		.data		= &sysctl_ip_vs_debug_level,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
+		.ctl_name	= NET_IPV4_VS_AMDROPRATE,
+		.procname	= "am_droprate",
+		.data		= &sysctl_ip_vs_am_droprate,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_DROP_ENTRY,
+		.procname	= "drop_entry",
+		.data		= &sysctl_ip_vs_drop_entry,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_defense_mode,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_DROP_PACKET,
+		.procname	= "drop_packet",
+		.data		= &sysctl_ip_vs_drop_packet,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_defense_mode,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_SECURE_TCP,
+		.procname	= "secure_tcp",
+		.data		= &sysctl_ip_vs_secure_tcp,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_defense_mode,
+	},
+#if 0
+	{
+		.ctl_name	= NET_IPV4_VS_TO_ES,
+		.procname	= "timeout_established",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_SS,
+		.procname	= "timeout_synsent",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_SR,
+		.procname	= "timeout_synrecv",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_FW,
+		.procname	= "timeout_finwait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_TW,
+		.procname	= "timeout_timewait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_CL,
+		.procname	= "timeout_close",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_CW,
+		.procname	= "timeout_closewait",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_LA,
+		.procname	= "timeout_lastack",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_LI,
+		.procname	= "timeout_listen",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_SA,
+		.procname	= "timeout_synack",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_UDP,
+		.procname	= "timeout_udp",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_TO_ICMP,
+		.procname	= "timeout_icmp",
+		.data	= &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+#endif
+	{
+		.ctl_name	= NET_IPV4_VS_CACHE_BYPASS,
+		.procname	= "cache_bypass",
+		.data		= &sysctl_ip_vs_cache_bypass,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_EXPIRE_NODEST_CONN,
+		.procname	= "expire_nodest_conn",
+		.data		= &sysctl_ip_vs_expire_nodest_conn,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE,
+		.procname	= "expire_quiescent_template",
+		.data		= &sysctl_ip_vs_expire_quiescent_template,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_SYNC_THRESHOLD,
+		.procname	= "sync_threshold",
+		.data		= &sysctl_ip_vs_sync_threshold,
+		.maxlen		= sizeof(sysctl_ip_vs_sync_threshold),
+		.mode		= 0644,
+		.proc_handler	= &proc_do_sync_threshold,
+	},
+	{
+		.ctl_name	= NET_IPV4_VS_NAT_ICMP_SEND,
+		.procname	= "nat_icmp_send",
+		.data		= &sysctl_ip_vs_nat_icmp_send,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table vs_table[] = {
+	{
+		.ctl_name	= NET_IPV4_VS,
+		.procname	= "vs",
+		.mode		= 0555,
+		.child		= vs_vars
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ipv4_table[] = {
+	{
+		.ctl_name	= NET_IPV4,
+		.procname	= "ipv4",
+		.mode		= 0555,
+		.child		= vs_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table vs_root_table[] = {
+	{
+		.ctl_name	= CTL_NET,
+		.procname	= "net",
+		.mode		= 0555,
+		.child		= ipv4_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+#ifdef CONFIG_PROC_FS
+
+struct ip_vs_iter {
+	struct list_head *table;
+	int bucket;
+};
+
+/*
+ *	Write the contents of the VS rule table to a PROCfs file.
+ *	(It is kept just for backward compatibility)
+ */
+static inline const char *ip_vs_fwd_name(unsigned flags)
+{
+	switch (flags & IP_VS_CONN_F_FWD_MASK) {
+	case IP_VS_CONN_F_LOCALNODE:
+		return "Local";
+	case IP_VS_CONN_F_TUNNEL:
+		return "Tunnel";
+	case IP_VS_CONN_F_DROUTE:
+		return "Route";
+	default:
+		return "Masq";
+	}
+}
+
+
+/* Get the Nth entry in the two lists */
+static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
+{
+	struct ip_vs_iter *iter = seq->private;
+	int idx;
+	struct ip_vs_service *svc;
+
+	/* look in hash by protocol */
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+			if (pos-- == 0){
+				iter->table = ip_vs_svc_table;
+				iter->bucket = idx;
+				return svc;
+			}
+		}
+	}
+
+	/* keep looking in fwmark */
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+			if (pos-- == 0) {
+				iter->table = ip_vs_svc_fwm_table;
+				iter->bucket = idx;
+				return svc;
+			}
+		}
+	}
+
+	return NULL;
+}
+
+static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
+{
+
+	read_lock_bh(&__ip_vs_svc_lock);
+	return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+
+static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct list_head *e;
+	struct ip_vs_iter *iter;
+	struct ip_vs_service *svc;
+
+	++*pos;
+	if (v == SEQ_START_TOKEN)
+		return ip_vs_info_array(seq,0);
+
+	svc = v;
+	iter = seq->private;
+
+	if (iter->table == ip_vs_svc_table) {
+		/* next service in table hashed by protocol */
+		if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
+			return list_entry(e, struct ip_vs_service, s_list);
+
+
+		while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+			list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
+					    s_list) {
+				return svc;
+			}
+		}
+
+		iter->table = ip_vs_svc_fwm_table;
+		iter->bucket = -1;
+		goto scan_fwmark;
+	}
+
+	/* next service in hashed by fwmark */
+	if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
+		return list_entry(e, struct ip_vs_service, f_list);
+
+ scan_fwmark:
+	while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
+				    f_list)
+			return svc;
+	}
+
+	return NULL;
+}
+
+static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock_bh(&__ip_vs_svc_lock);
+}
+
+
+static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq,
+			"IP Virtual Server version %d.%d.%d (size=%d)\n",
+			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+		seq_puts(seq,
+			 "Prot LocalAddress:Port Scheduler Flags\n");
+		seq_puts(seq,
+			 "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
+	} else {
+		const struct ip_vs_service *svc = v;
+		const struct ip_vs_iter *iter = seq->private;
+		const struct ip_vs_dest *dest;
+
+		if (iter->table == ip_vs_svc_table)
+			seq_printf(seq, "%s  %08X:%04X %s ",
+				   ip_vs_proto_name(svc->protocol),
+				   ntohl(svc->addr),
+				   ntohs(svc->port),
+				   svc->scheduler->name);
+		else
+			seq_printf(seq, "FWM  %08X %s ",
+				   svc->fwmark, svc->scheduler->name);
+
+		if (svc->flags & IP_VS_SVC_F_PERSISTENT)
+			seq_printf(seq, "persistent %d %08X\n",
+				svc->timeout,
+				ntohl(svc->netmask));
+		else
+			seq_putc(seq, '\n');
+
+		list_for_each_entry(dest, &svc->destinations, n_list) {
+			seq_printf(seq,
+				   "  -> %08X:%04X      %-7s %-6d %-10d %-10d\n",
+				   ntohl(dest->addr), ntohs(dest->port),
+				   ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
+				   atomic_read(&dest->weight),
+				   atomic_read(&dest->activeconns),
+				   atomic_read(&dest->inactconns));
+		}
+	}
+	return 0;
+}
+
+static struct seq_operations ip_vs_info_seq_ops = {
+	.start = ip_vs_info_seq_start,
+	.next  = ip_vs_info_seq_next,
+	.stop  = ip_vs_info_seq_stop,
+	.show  = ip_vs_info_seq_show,
+};
+
+static int ip_vs_info_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (!s)
+		goto out;
+
+	rc = seq_open(file, &ip_vs_info_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq	     = file->private_data;
+	seq->private = s;
+	memset(s, 0, sizeof(*s));
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+static struct file_operations ip_vs_info_fops = {
+	.owner	 = THIS_MODULE,
+	.open    = ip_vs_info_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+
+#endif
+
+struct ip_vs_stats ip_vs_stats;
+
+#ifdef CONFIG_PROC_FS
+static int ip_vs_stats_show(struct seq_file *seq, void *v)
+{
+
+/*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
+	seq_puts(seq,
+		 "   Total Incoming Outgoing         Incoming         Outgoing\n");
+	seq_printf(seq,
+		   "   Conns  Packets  Packets            Bytes            Bytes\n");
+
+	spin_lock_bh(&ip_vs_stats.lock);
+	seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns,
+		   ip_vs_stats.inpkts, ip_vs_stats.outpkts,
+		   (unsigned long long) ip_vs_stats.inbytes,
+		   (unsigned long long) ip_vs_stats.outbytes);
+
+/*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
+	seq_puts(seq,
+		   " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
+	seq_printf(seq,"%8X %8X %8X %16X %16X\n",
+			ip_vs_stats.cps,
+			ip_vs_stats.inpps,
+			ip_vs_stats.outpps,
+			ip_vs_stats.inbps,
+			ip_vs_stats.outbps);
+	spin_unlock_bh(&ip_vs_stats.lock);
+
+	return 0;
+}
+
+static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, ip_vs_stats_show, NULL);
+}
+
+static struct file_operations ip_vs_stats_fops = {
+	.owner = THIS_MODULE,
+	.open = ip_vs_stats_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+#endif
+
+/*
+ *	Set timeout values for tcp tcpfin udp in the timeout_table.
+ */
+static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
+{
+	IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
+		  u->tcp_timeout,
+		  u->tcp_fin_timeout,
+		  u->udp_timeout);
+
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	if (u->tcp_timeout) {
+		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED]
+			= u->tcp_timeout * HZ;
+	}
+
+	if (u->tcp_fin_timeout) {
+		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT]
+			= u->tcp_fin_timeout * HZ;
+	}
+#endif
+
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	if (u->udp_timeout) {
+		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL]
+			= u->udp_timeout * HZ;
+	}
+#endif
+	return 0;
+}
+
+
+#define SET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
+#define SERVICE_ARG_LEN		(sizeof(struct ip_vs_service_user))
+#define SVCDEST_ARG_LEN		(sizeof(struct ip_vs_service_user) +	\
+				 sizeof(struct ip_vs_dest_user))
+#define TIMEOUT_ARG_LEN		(sizeof(struct ip_vs_timeout_user))
+#define DAEMON_ARG_LEN		(sizeof(struct ip_vs_daemon_user))
+#define MAX_ARG_LEN		SVCDEST_ARG_LEN
+
+static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
+	[SET_CMDID(IP_VS_SO_SET_ADD)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_EDIT)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_DEL)]		= SERVICE_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_FLUSH)]		= 0,
+	[SET_CMDID(IP_VS_SO_SET_ADDDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_DELDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_EDITDEST)]	= SVCDEST_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_TIMEOUT)]	= TIMEOUT_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]	= DAEMON_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]	= DAEMON_ARG_LEN,
+	[SET_CMDID(IP_VS_SO_SET_ZERO)]		= SERVICE_ARG_LEN,
+};
+
+static int
+do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+	int ret;
+	unsigned char arg[MAX_ARG_LEN];
+	struct ip_vs_service_user *usvc;
+	struct ip_vs_service *svc;
+	struct ip_vs_dest_user *udest;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (len != set_arglen[SET_CMDID(cmd)]) {
+		IP_VS_ERR("set_ctl: len %u != %u\n",
+			  len, set_arglen[SET_CMDID(cmd)]);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(arg, user, len) != 0)
+		return -EFAULT;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	if (down_interruptible(&__ip_vs_mutex)) {
+		ret = -ERESTARTSYS;
+		goto out_dec;
+	}
+
+	if (cmd == IP_VS_SO_SET_FLUSH) {
+		/* Flush the virtual service */
+		ret = ip_vs_flush();
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
+		/* Set timeout values for (tcp tcpfin udp) */
+		ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg);
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
+		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+		ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid);
+		goto out_unlock;
+	} else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
+		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
+		ret = stop_sync_thread(dm->state);
+		goto out_unlock;
+	}
+
+	usvc = (struct ip_vs_service_user *)arg;
+	udest = (struct ip_vs_dest_user *)(usvc + 1);
+
+	if (cmd == IP_VS_SO_SET_ZERO) {
+		/* if no service address is set, zero counters in all */
+		if (!usvc->fwmark && !usvc->addr && !usvc->port) {
+			ret = ip_vs_zero_all();
+			goto out_unlock;
+		}
+	}
+
+	/* Check for valid protocol: TCP or UDP, even for fwmark!=0 */
+	if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) {
+		IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n",
+			  usvc->protocol, NIPQUAD(usvc->addr),
+			  ntohs(usvc->port), usvc->sched_name);
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	/* Lookup the exact service by <protocol, addr, port> or fwmark */
+	if (usvc->fwmark == 0)
+		svc = __ip_vs_service_get(usvc->protocol,
+					  usvc->addr, usvc->port);
+	else
+		svc = __ip_vs_svc_fwm_get(usvc->fwmark);
+
+	if (cmd != IP_VS_SO_SET_ADD
+	    && (svc == NULL || svc->protocol != usvc->protocol)) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+
+	switch (cmd) {
+	case IP_VS_SO_SET_ADD:
+		if (svc != NULL)
+			ret = -EEXIST;
+		else
+			ret = ip_vs_add_service(usvc, &svc);
+		break;
+	case IP_VS_SO_SET_EDIT:
+		ret = ip_vs_edit_service(svc, usvc);
+		break;
+	case IP_VS_SO_SET_DEL:
+		ret = ip_vs_del_service(svc);
+		if (!ret)
+			goto out_unlock;
+		break;
+	case IP_VS_SO_SET_ZERO:
+		ret = ip_vs_zero_service(svc);
+		break;
+	case IP_VS_SO_SET_ADDDEST:
+		ret = ip_vs_add_dest(svc, udest);
+		break;
+	case IP_VS_SO_SET_EDITDEST:
+		ret = ip_vs_edit_dest(svc, udest);
+		break;
+	case IP_VS_SO_SET_DELDEST:
+		ret = ip_vs_del_dest(svc, udest);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	if (svc)
+		ip_vs_service_put(svc);
+
+  out_unlock:
+	up(&__ip_vs_mutex);
+  out_dec:
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	return ret;
+}
+
+
+static void
+ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
+{
+	spin_lock_bh(&src->lock);
+	memcpy(dst, src, (char*)&src->lock - (char*)src);
+	spin_unlock_bh(&src->lock);
+}
+
+static void
+ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
+{
+	dst->protocol = src->protocol;
+	dst->addr = src->addr;
+	dst->port = src->port;
+	dst->fwmark = src->fwmark;
+	strcpy(dst->sched_name, src->scheduler->name);
+	dst->flags = src->flags;
+	dst->timeout = src->timeout / HZ;
+	dst->netmask = src->netmask;
+	dst->num_dests = src->num_dests;
+	ip_vs_copy_stats(&dst->stats, &src->stats);
+}
+
+static inline int
+__ip_vs_get_service_entries(const struct ip_vs_get_services *get,
+			    struct ip_vs_get_services __user *uptr)
+{
+	int idx, count=0;
+	struct ip_vs_service *svc;
+	struct ip_vs_service_entry entry;
+	int ret = 0;
+
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
+			if (count >= get->num_services)
+				goto out;
+			ip_vs_copy_service(&entry, svc);
+			if (copy_to_user(&uptr->entrytable[count],
+					 &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				goto out;
+			}
+			count++;
+		}
+	}
+
+	for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
+		list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
+			if (count >= get->num_services)
+				goto out;
+			ip_vs_copy_service(&entry, svc);
+			if (copy_to_user(&uptr->entrytable[count],
+					 &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				goto out;
+			}
+			count++;
+		}
+	}
+  out:
+	return ret;
+}
+
+static inline int
+__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
+			 struct ip_vs_get_dests __user *uptr)
+{
+	struct ip_vs_service *svc;
+	int ret = 0;
+
+	if (get->fwmark)
+		svc = __ip_vs_svc_fwm_get(get->fwmark);
+	else
+		svc = __ip_vs_service_get(get->protocol,
+					  get->addr, get->port);
+	if (svc) {
+		int count = 0;
+		struct ip_vs_dest *dest;
+		struct ip_vs_dest_entry entry;
+
+		list_for_each_entry(dest, &svc->destinations, n_list) {
+			if (count >= get->num_dests)
+				break;
+
+			entry.addr = dest->addr;
+			entry.port = dest->port;
+			entry.conn_flags = atomic_read(&dest->conn_flags);
+			entry.weight = atomic_read(&dest->weight);
+			entry.u_threshold = dest->u_threshold;
+			entry.l_threshold = dest->l_threshold;
+			entry.activeconns = atomic_read(&dest->activeconns);
+			entry.inactconns = atomic_read(&dest->inactconns);
+			entry.persistconns = atomic_read(&dest->persistconns);
+			ip_vs_copy_stats(&entry.stats, &dest->stats);
+			if (copy_to_user(&uptr->entrytable[count],
+					 &entry, sizeof(entry))) {
+				ret = -EFAULT;
+				break;
+			}
+			count++;
+		}
+		ip_vs_service_put(svc);
+	} else
+		ret = -ESRCH;
+	return ret;
+}
+
+static inline void
+__ip_vs_get_timeouts(struct ip_vs_timeout_user *u)
+{
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	u->tcp_timeout =
+		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
+	u->tcp_fin_timeout =
+		ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	u->udp_timeout =
+		ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
+#endif
+}
+
+
+#define GET_CMDID(cmd)		(cmd - IP_VS_BASE_CTL)
+#define GET_INFO_ARG_LEN	(sizeof(struct ip_vs_getinfo))
+#define GET_SERVICES_ARG_LEN	(sizeof(struct ip_vs_get_services))
+#define GET_SERVICE_ARG_LEN	(sizeof(struct ip_vs_service_entry))
+#define GET_DESTS_ARG_LEN	(sizeof(struct ip_vs_get_dests))
+#define GET_TIMEOUT_ARG_LEN	(sizeof(struct ip_vs_timeout_user))
+#define GET_DAEMON_ARG_LEN	(sizeof(struct ip_vs_daemon_user) * 2)
+
+static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
+	[GET_CMDID(IP_VS_SO_GET_VERSION)]	= 64,
+	[GET_CMDID(IP_VS_SO_GET_INFO)]		= GET_INFO_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_SERVICES)]	= GET_SERVICES_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_SERVICE)]	= GET_SERVICE_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_DESTS)]		= GET_DESTS_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_TIMEOUT)]	= GET_TIMEOUT_ARG_LEN,
+	[GET_CMDID(IP_VS_SO_GET_DAEMON)]	= GET_DAEMON_ARG_LEN,
+};
+
+static int
+do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	unsigned char arg[128];
+	int ret = 0;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	if (*len < get_arglen[GET_CMDID(cmd)]) {
+		IP_VS_ERR("get_ctl: len %u < %u\n",
+			  *len, get_arglen[GET_CMDID(cmd)]);
+		return -EINVAL;
+	}
+
+	if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0)
+		return -EFAULT;
+
+	if (down_interruptible(&__ip_vs_mutex))
+		return -ERESTARTSYS;
+
+	switch (cmd) {
+	case IP_VS_SO_GET_VERSION:
+	{
+		char buf[64];
+
+		sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
+			NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE);
+		if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+		*len = strlen(buf)+1;
+	}
+	break;
+
+	case IP_VS_SO_GET_INFO:
+	{
+		struct ip_vs_getinfo info;
+		info.version = IP_VS_VERSION_CODE;
+		info.size = IP_VS_CONN_TAB_SIZE;
+		info.num_services = ip_vs_num_services;
+		if (copy_to_user(user, &info, sizeof(info)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	case IP_VS_SO_GET_SERVICES:
+	{
+		struct ip_vs_get_services *get;
+		int size;
+
+		get = (struct ip_vs_get_services *)arg;
+		size = sizeof(*get) +
+			sizeof(struct ip_vs_service_entry) * get->num_services;
+		if (*len != size) {
+			IP_VS_ERR("length: %u != %u\n", *len, size);
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = __ip_vs_get_service_entries(get, user);
+	}
+	break;
+
+	case IP_VS_SO_GET_SERVICE:
+	{
+		struct ip_vs_service_entry *entry;
+		struct ip_vs_service *svc;
+
+		entry = (struct ip_vs_service_entry *)arg;
+		if (entry->fwmark)
+			svc = __ip_vs_svc_fwm_get(entry->fwmark);
+		else
+			svc = __ip_vs_service_get(entry->protocol,
+						  entry->addr, entry->port);
+		if (svc) {
+			ip_vs_copy_service(entry, svc);
+			if (copy_to_user(user, entry, sizeof(*entry)) != 0)
+				ret = -EFAULT;
+			ip_vs_service_put(svc);
+		} else
+			ret = -ESRCH;
+	}
+	break;
+
+	case IP_VS_SO_GET_DESTS:
+	{
+		struct ip_vs_get_dests *get;
+		int size;
+
+		get = (struct ip_vs_get_dests *)arg;
+		size = sizeof(*get) +
+			sizeof(struct ip_vs_dest_entry) * get->num_dests;
+		if (*len != size) {
+			IP_VS_ERR("length: %u != %u\n", *len, size);
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = __ip_vs_get_dest_entries(get, user);
+	}
+	break;
+
+	case IP_VS_SO_GET_TIMEOUT:
+	{
+		struct ip_vs_timeout_user t;
+
+		__ip_vs_get_timeouts(&t);
+		if (copy_to_user(user, &t, sizeof(t)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	case IP_VS_SO_GET_DAEMON:
+	{
+		struct ip_vs_daemon_user d[2];
+
+		memset(&d, 0, sizeof(d));
+		if (ip_vs_sync_state & IP_VS_STATE_MASTER) {
+			d[0].state = IP_VS_STATE_MASTER;
+			strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn);
+			d[0].syncid = ip_vs_master_syncid;
+		}
+		if (ip_vs_sync_state & IP_VS_STATE_BACKUP) {
+			d[1].state = IP_VS_STATE_BACKUP;
+			strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn);
+			d[1].syncid = ip_vs_backup_syncid;
+		}
+		if (copy_to_user(user, &d, sizeof(d)) != 0)
+			ret = -EFAULT;
+	}
+	break;
+
+	default:
+		ret = -EINVAL;
+	}
+
+  out:
+	up(&__ip_vs_mutex);
+	return ret;
+}
+
+
+static struct nf_sockopt_ops ip_vs_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= IP_VS_BASE_CTL,
+	.set_optmax	= IP_VS_SO_SET_MAX+1,
+	.set		= do_ip_vs_set_ctl,
+	.get_optmin	= IP_VS_BASE_CTL,
+	.get_optmax	= IP_VS_SO_GET_MAX+1,
+	.get		= do_ip_vs_get_ctl,
+};
+
+
+int ip_vs_control_init(void)
+{
+	int ret;
+	int idx;
+
+	EnterFunction(2);
+
+	ret = nf_register_sockopt(&ip_vs_sockopts);
+	if (ret) {
+		IP_VS_ERR("cannot register sockopt.\n");
+		return ret;
+	}
+
+	proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops);
+	proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops);
+
+	sysctl_header = register_sysctl_table(vs_root_table, 0);
+
+	/* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */
+	for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
+		INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
+	}
+	for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++)  {
+		INIT_LIST_HEAD(&ip_vs_rtable[idx]);
+	}
+
+	memset(&ip_vs_stats, 0, sizeof(ip_vs_stats));
+	spin_lock_init(&ip_vs_stats.lock);
+	ip_vs_new_estimator(&ip_vs_stats);
+
+	/* Hook the defense timer */
+	schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
+
+	LeaveFunction(2);
+	return 0;
+}
+
+
+void ip_vs_control_cleanup(void)
+{
+	EnterFunction(2);
+	ip_vs_trash_cleanup();
+	cancel_rearming_delayed_work(&defense_work);
+	ip_vs_kill_estimator(&ip_vs_stats);
+	unregister_sysctl_table(sysctl_header);
+	proc_net_remove("ip_vs_stats");
+	proc_net_remove("ip_vs");
+	nf_unregister_sockopt(&ip_vs_sockopts);
+	LeaveFunction(2);
+}
diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c
new file mode 100644
index 000000000000..f3bc320dce93
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_dh.c
@@ -0,0 +1,258 @@
+/*
+ * IPVS:        Destination Hashing scheduling module
+ *
+ * Version:     $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              Inspired by the consistent hashing scheduler patch from
+ *              Thomas Proell <proellt@gmx.de>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The dh algorithm is to select server by the hash key of destination IP
+ * address. The pseudo code is as follows:
+ *
+ *       n <- servernode[dest_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) OR (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Notes that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet destination IP address to the current server
+ * array. If the dh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS DH bucket
+ */
+struct ip_vs_dh_bucket {
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     for IPVS DH entry hash table
+ */
+#ifndef CONFIG_IP_VS_DH_TAB_BITS
+#define CONFIG_IP_VS_DH_TAB_BITS        8
+#endif
+#define IP_VS_DH_TAB_BITS               CONFIG_IP_VS_DH_TAB_BITS
+#define IP_VS_DH_TAB_SIZE               (1 << IP_VS_DH_TAB_BITS)
+#define IP_VS_DH_TAB_MASK               (IP_VS_DH_TAB_SIZE - 1)
+
+
+/*
+ *	Returns hash value for IPVS DH entry
+ */
+static inline unsigned ip_vs_dh_hashkey(__u32 addr)
+{
+	return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK;
+}
+
+
+/*
+ *      Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr)
+{
+	return (tbl[ip_vs_dh_hashkey(addr)]).dest;
+}
+
+
+/*
+ *      Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_dh_bucket *b;
+	struct list_head *p;
+	struct ip_vs_dest *dest;
+
+	b = tbl;
+	p = &svc->destinations;
+	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+		if (list_empty(p)) {
+			b->dest = NULL;
+		} else {
+			if (p == &svc->destinations)
+				p = p->next;
+
+			dest = list_entry(p, struct ip_vs_dest, n_list);
+			atomic_inc(&dest->refcnt);
+			b->dest = dest;
+
+			p = p->next;
+		}
+		b++;
+	}
+	return 0;
+}
+
+
+/*
+ *      Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl)
+{
+	int i;
+	struct ip_vs_dh_bucket *b;
+
+	b = tbl;
+	for (i=0; i<IP_VS_DH_TAB_SIZE; i++) {
+		if (b->dest) {
+			atomic_dec(&b->dest->refcnt);
+			b->dest = NULL;
+		}
+		b++;
+	}
+}
+
+
+static int ip_vs_dh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *tbl;
+
+	/* allocate the DH table for this service */
+	tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE,
+		      GFP_ATOMIC);
+	if (tbl == NULL) {
+		IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_dh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+static int ip_vs_dh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_dh_flush(tbl);
+
+	/* release the table itself */
+	kfree(svc->sched_data);
+	IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE);
+
+	return 0;
+}
+
+
+static int ip_vs_dh_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_dh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_dh_flush(tbl);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_dh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Destination hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_dh_bucket *tbl;
+	struct iphdr *iph = skb->nh.iph;
+
+	IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
+
+	tbl = (struct ip_vs_dh_bucket *)svc->sched_data;
+	dest = ip_vs_dh_get(tbl, iph->daddr);
+	if (!dest
+	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+	    || atomic_read(&dest->weight) <= 0
+	    || is_overloaded(dest)) {
+		return NULL;
+	}
+
+	IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u "
+		  "--> server %u.%u.%u.%u:%d\n",
+		  NIPQUAD(iph->daddr),
+		  NIPQUAD(dest->addr),
+		  ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS DH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_dh_scheduler =
+{
+	.name =			"dh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_dh_init_svc,
+	.done_service =		ip_vs_dh_done_svc,
+	.update_service =	ip_vs_dh_update_svc,
+	.schedule =		ip_vs_dh_schedule,
+};
+
+
+static int __init ip_vs_dh_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+static void __exit ip_vs_dh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_dh_scheduler);
+}
+
+
+module_init(ip_vs_dh_init);
+module_exit(ip_vs_dh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c
new file mode 100644
index 000000000000..67b3e2fc1fa1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_est.c
@@ -0,0 +1,200 @@
+/*
+ * ip_vs_est.c: simple rate estimator for IPVS
+ *
+ * Version:     $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include <net/ip_vs.h>
+
+/*
+  This code is to estimate rate in a shorter interval (such as 8
+  seconds) for virtual services and real servers. For measure rate in a
+  long interval, it is easy to implement a user level daemon which
+  periodically reads those statistical counters and measure rate.
+
+  Currently, the measurement is activated by slow timer handler. Hope
+  this measurement will not introduce too much load.
+
+  We measure rate during the last 8 seconds every 2 seconds:
+
+    avgrate = avgrate*(1-W) + rate*W
+
+    where W = 2^(-2)
+
+  NOTES.
+
+  * The stored value for average bps is scaled by 2^5, so that maximal
+    rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10.
+
+  * A lot code is taken from net/sched/estimator.c
+ */
+
+
+struct ip_vs_estimator
+{
+	struct ip_vs_estimator	*next;
+	struct ip_vs_stats	*stats;
+
+	u32			last_conns;
+	u32			last_inpkts;
+	u32			last_outpkts;
+	u64			last_inbytes;
+	u64			last_outbytes;
+
+	u32			cps;
+	u32			inpps;
+	u32			outpps;
+	u32			inbps;
+	u32			outbps;
+};
+
+
+static struct ip_vs_estimator *est_list = NULL;
+static DEFINE_RWLOCK(est_lock);
+static struct timer_list est_timer;
+
+static void estimation_timer(unsigned long arg)
+{
+	struct ip_vs_estimator *e;
+	struct ip_vs_stats *s;
+	u32 n_conns;
+	u32 n_inpkts, n_outpkts;
+	u64 n_inbytes, n_outbytes;
+	u32 rate;
+
+	read_lock(&est_lock);
+	for (e = est_list; e; e = e->next) {
+		s = e->stats;
+
+		spin_lock(&s->lock);
+		n_conns = s->conns;
+		n_inpkts = s->inpkts;
+		n_outpkts = s->outpkts;
+		n_inbytes = s->inbytes;
+		n_outbytes = s->outbytes;
+
+		/* scaled by 2^10, but divided 2 seconds */
+		rate = (n_conns - e->last_conns)<<9;
+		e->last_conns = n_conns;
+		e->cps += ((long)rate - (long)e->cps)>>2;
+		s->cps = (e->cps+0x1FF)>>10;
+
+		rate = (n_inpkts - e->last_inpkts)<<9;
+		e->last_inpkts = n_inpkts;
+		e->inpps += ((long)rate - (long)e->inpps)>>2;
+		s->inpps = (e->inpps+0x1FF)>>10;
+
+		rate = (n_outpkts - e->last_outpkts)<<9;
+		e->last_outpkts = n_outpkts;
+		e->outpps += ((long)rate - (long)e->outpps)>>2;
+		s->outpps = (e->outpps+0x1FF)>>10;
+
+		rate = (n_inbytes - e->last_inbytes)<<4;
+		e->last_inbytes = n_inbytes;
+		e->inbps += ((long)rate - (long)e->inbps)>>2;
+		s->inbps = (e->inbps+0xF)>>5;
+
+		rate = (n_outbytes - e->last_outbytes)<<4;
+		e->last_outbytes = n_outbytes;
+		e->outbps += ((long)rate - (long)e->outbps)>>2;
+		s->outbps = (e->outbps+0xF)>>5;
+		spin_unlock(&s->lock);
+	}
+	read_unlock(&est_lock);
+	mod_timer(&est_timer, jiffies + 2*HZ);
+}
+
+int ip_vs_new_estimator(struct ip_vs_stats *stats)
+{
+	struct ip_vs_estimator *est;
+
+	est = kmalloc(sizeof(*est), GFP_KERNEL);
+	if (est == NULL)
+		return -ENOMEM;
+
+	memset(est, 0, sizeof(*est));
+	est->stats = stats;
+	est->last_conns = stats->conns;
+	est->cps = stats->cps<<10;
+
+	est->last_inpkts = stats->inpkts;
+	est->inpps = stats->inpps<<10;
+
+	est->last_outpkts = stats->outpkts;
+	est->outpps = stats->outpps<<10;
+
+	est->last_inbytes = stats->inbytes;
+	est->inbps = stats->inbps<<5;
+
+	est->last_outbytes = stats->outbytes;
+	est->outbps = stats->outbps<<5;
+
+	write_lock_bh(&est_lock);
+	est->next = est_list;
+	if (est->next == NULL) {
+		init_timer(&est_timer);
+		est_timer.expires = jiffies + 2*HZ;
+		est_timer.function = estimation_timer;
+		add_timer(&est_timer);
+	}
+	est_list = est;
+	write_unlock_bh(&est_lock);
+	return 0;
+}
+
+void ip_vs_kill_estimator(struct ip_vs_stats *stats)
+{
+	struct ip_vs_estimator *est, **pest;
+	int killed = 0;
+
+	write_lock_bh(&est_lock);
+	pest = &est_list;
+	while ((est=*pest) != NULL) {
+		if (est->stats != stats) {
+			pest = &est->next;
+			continue;
+		}
+		*pest = est->next;
+		kfree(est);
+		killed++;
+	}
+	if (killed && est_list == NULL)
+		del_timer_sync(&est_timer);
+	write_unlock_bh(&est_lock);
+}
+
+void ip_vs_zero_estimator(struct ip_vs_stats *stats)
+{
+	struct ip_vs_estimator *e;
+
+	write_lock_bh(&est_lock);
+	for (e = est_list; e; e = e->next) {
+		if (e->stats != stats)
+			continue;
+
+		/* set counters zero */
+		e->last_conns = 0;
+		e->last_inpkts = 0;
+		e->last_outpkts = 0;
+		e->last_inbytes = 0;
+		e->last_outbytes = 0;
+		e->cps = 0;
+		e->inpps = 0;
+		e->outpps = 0;
+		e->inbps = 0;
+		e->outbps = 0;
+	}
+	write_unlock_bh(&est_lock);
+}
diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c
new file mode 100644
index 000000000000..a19a33ceb811
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_ftp.c
@@ -0,0 +1,400 @@
+/*
+ * ip_vs_ftp.c: IPVS ftp application module
+ *
+ * Version:	$Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors:	Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * Changes:
+ *
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference
+ * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp.
+ *
+ *		IP_MASQ_FTP ftp masquerading module
+ *
+ * Version:	@(#)ip_masq_ftp.c 0.04   02/05/96
+ *
+ * Author:	Wouter Gadeyne
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+
+#include <net/ip_vs.h>
+
+
+#define SERVER_STRING "227 Entering Passive Mode ("
+#define CLIENT_STRING "PORT "
+
+
+/*
+ * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
+ * First port is set to the default port.
+ */
+static int ports[IP_VS_APP_MAX_PORTS] = {21, 0};
+module_param_array(ports, int, NULL, 0);
+
+/*
+ *	Debug level
+ */
+#ifdef CONFIG_IP_VS_DEBUG
+static int debug=0;
+module_param(debug, int, 0);
+#endif
+
+
+/*	Dummy variable */
+static int ip_vs_ftp_pasv;
+
+
+static int
+ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	return 0;
+}
+
+
+static int
+ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
+{
+	return 0;
+}
+
+
+/*
+ * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern" and terminated with the "term" character.
+ * <addr,port> is in network order.
+ */
+static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
+				  const char *pattern, size_t plen, char term,
+				  __u32 *addr, __u16 *port,
+				  char **start, char **end)
+{
+	unsigned char p[6];
+	int i = 0;
+
+	if (data_limit - data < plen) {
+		/* check if there is partial match */
+		if (strnicmp(data, pattern, data_limit - data) == 0)
+			return -1;
+		else
+			return 0;
+	}
+
+	if (strnicmp(data, pattern, plen) != 0) {
+		return 0;
+	}
+	*start = data + plen;
+
+	for (data = *start; *data != term; data++) {
+		if (data == data_limit)
+			return -1;
+	}
+	*end = data;
+
+	memset(p, 0, sizeof(p));
+	for (data = *start; data != *end; data++) {
+		if (*data >= '0' && *data <= '9') {
+			p[i] = p[i]*10 + *data - '0';
+		} else if (*data == ',' && i < 5) {
+			i++;
+		} else {
+			/* unexpected character */
+			return -1;
+		}
+	}
+
+	if (i != 5)
+		return -1;
+
+	*addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0];
+	*port = (p[5]<<8) | p[4];
+	return 1;
+}
+
+
+/*
+ * Look at outgoing ftp packets to catch the response to a PASV command
+ * from the server (inside-to-outside).
+ * When we see one, we build a connection entry with the client address,
+ * client port 0 (unknown at the moment), the server address and the
+ * server port.  Mark the current connection entry as a control channel
+ * of the new entry. All this work is just to make the data connection
+ * can be scheduled to the right server later.
+ *
+ * The outgoing packet should be something like
+ *   "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
+ * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ */
+static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
+			 struct sk_buff **pskb, int *diff)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	char *data, *data_limit;
+	char *start, *end;
+	__u32 from;
+	__u16 port;
+	struct ip_vs_conn *n_cp;
+	char buf[24];		/* xxx.xxx.xxx.xxx,ppp,ppp\000 */
+	unsigned buf_len;
+	int ret;
+
+	*diff = 0;
+
+	/* Only useful for established sessions */
+	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+		return 1;
+
+	/* Linear packets are much easier to deal with. */
+	if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
+		return 0;
+
+	if (cp->app_data == &ip_vs_ftp_pasv) {
+		iph = (*pskb)->nh.iph;
+		th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+		data = (char *)th + (th->doff << 2);
+		data_limit = (*pskb)->tail;
+
+		if (ip_vs_ftp_get_addrport(data, data_limit,
+					   SERVER_STRING,
+					   sizeof(SERVER_STRING)-1, ')',
+					   &from, &port,
+					   &start, &end) != 1)
+			return 1;
+
+		IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
+			  "%u.%u.%u.%u:%d detected\n",
+			  NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0);
+
+		/*
+		 * Now update or create an connection entry for it
+		 */
+		n_cp = ip_vs_conn_out_get(iph->protocol, from, port,
+					  cp->caddr, 0);
+		if (!n_cp) {
+			n_cp = ip_vs_conn_new(IPPROTO_TCP,
+					      cp->caddr, 0,
+					      cp->vaddr, port,
+					      from, port,
+					      IP_VS_CONN_F_NO_CPORT,
+					      cp->dest);
+			if (!n_cp)
+				return 0;
+
+			/* add its controller */
+			ip_vs_control_add(n_cp, cp);
+		}
+
+		/*
+		 * Replace the old passive address with the new one
+		 */
+		from = n_cp->vaddr;
+		port = n_cp->vport;
+		sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
+			port&255, (port>>8)&255);
+		buf_len = strlen(buf);
+
+		/*
+		 * Calculate required delta-offset to keep TCP happy
+		 */
+		*diff = buf_len - (end-start);
+
+		if (*diff == 0) {
+			/* simply replace it with new passive address */
+			memcpy(start, buf, buf_len);
+			ret = 1;
+		} else {
+			ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
+					  end-start, buf, buf_len);
+		}
+
+		cp->app_data = NULL;
+		ip_vs_tcp_conn_listen(n_cp);
+		ip_vs_conn_put(n_cp);
+		return ret;
+	}
+	return 1;
+}
+
+
+/*
+ * Look at incoming ftp packets to catch the PASV/PORT command
+ * (outside-to-inside).
+ *
+ * The incoming packet having the PORT command should be something like
+ *      "PORT xxx,xxx,xxx,xxx,ppp,ppp\n".
+ * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number.
+ * In this case, we create a connection entry using the client address and
+ * port, so that the active ftp data connection from the server can reach
+ * the client.
+ */
+static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
+			struct sk_buff **pskb, int *diff)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	char *data, *data_start, *data_limit;
+	char *start, *end;
+	__u32 to;
+	__u16 port;
+	struct ip_vs_conn *n_cp;
+
+	/* no diff required for incoming packets */
+	*diff = 0;
+
+	/* Only useful for established sessions */
+	if (cp->state != IP_VS_TCP_S_ESTABLISHED)
+		return 1;
+
+	/* Linear packets are much easier to deal with. */
+	if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
+		return 0;
+
+	/*
+	 * Detecting whether it is passive
+	 */
+	iph = (*pskb)->nh.iph;
+	th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
+
+	/* Since there may be OPTIONS in the TCP packet and the HLEN is
+	   the length of the header in 32-bit multiples, it is accurate
+	   to calculate data address by th+HLEN*4 */
+	data = data_start = (char *)th + (th->doff << 2);
+	data_limit = (*pskb)->tail;
+
+	while (data <= data_limit - 6) {
+		if (strnicmp(data, "PASV\r\n", 6) == 0) {
+			/* Passive mode on */
+			IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n",
+				  data - data_start,
+				  data_limit - data_start);
+			cp->app_data = &ip_vs_ftp_pasv;
+			return 1;
+		}
+		data++;
+	}
+
+	/*
+	 * To support virtual FTP server, the scenerio is as follows:
+	 *       FTP client ----> Load Balancer ----> FTP server
+	 * First detect the port number in the application data,
+	 * then create a new connection entry for the coming data
+	 * connection.
+	 */
+	if (ip_vs_ftp_get_addrport(data_start, data_limit,
+				   CLIENT_STRING, sizeof(CLIENT_STRING)-1,
+				   '\r', &to, &port,
+				   &start, &end) != 1)
+		return 1;
+
+	IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
+		  NIPQUAD(to), ntohs(port));
+
+	/* Passive mode off */
+	cp->app_data = NULL;
+
+	/*
+	 * Now update or create a connection entry for it
+	 */
+	IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
+		  ip_vs_proto_name(iph->protocol),
+		  NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
+
+	n_cp = ip_vs_conn_in_get(iph->protocol,
+				 to, port,
+				 cp->vaddr, htons(ntohs(cp->vport)-1));
+	if (!n_cp) {
+		n_cp = ip_vs_conn_new(IPPROTO_TCP,
+				      to, port,
+				      cp->vaddr, htons(ntohs(cp->vport)-1),
+				      cp->daddr, htons(ntohs(cp->dport)-1),
+				      0,
+				      cp->dest);
+		if (!n_cp)
+			return 0;
+
+		/* add its controller */
+		ip_vs_control_add(n_cp, cp);
+	}
+
+	/*
+	 *	Move tunnel to listen state
+	 */
+	ip_vs_tcp_conn_listen(n_cp);
+	ip_vs_conn_put(n_cp);
+
+	return 1;
+}
+
+
+static struct ip_vs_app ip_vs_ftp = {
+	.name =		"ftp",
+	.type =		IP_VS_APP_TYPE_FTP,
+	.protocol =	IPPROTO_TCP,
+	.module =	THIS_MODULE,
+	.incs_list =	LIST_HEAD_INIT(ip_vs_ftp.incs_list),
+	.init_conn =	ip_vs_ftp_init_conn,
+	.done_conn =	ip_vs_ftp_done_conn,
+	.bind_conn =	NULL,
+	.unbind_conn =	NULL,
+	.pkt_out =	ip_vs_ftp_out,
+	.pkt_in =	ip_vs_ftp_in,
+};
+
+
+/*
+ *	ip_vs_ftp initialization
+ */
+static int __init ip_vs_ftp_init(void)
+{
+	int i, ret;
+	struct ip_vs_app *app = &ip_vs_ftp;
+
+	ret = register_ip_vs_app(app);
+	if (ret)
+		return ret;
+
+	for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
+		if (!ports[i])
+			continue;
+		ret = register_ip_vs_app_inc(app, app->protocol, ports[i]);
+		if (ret)
+			break;
+		IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n",
+			  app->name, i, ports[i]);
+	}
+
+	if (ret)
+		unregister_ip_vs_app(app);
+
+	return ret;
+}
+
+
+/*
+ *	ip_vs_ftp finish.
+ */
+static void __exit ip_vs_ftp_exit(void)
+{
+	unregister_ip_vs_app(&ip_vs_ftp);
+}
+
+
+module_init(ip_vs_ftp_init);
+module_exit(ip_vs_ftp_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c
new file mode 100644
index 000000000000..c035838b780a
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblc.c
@@ -0,0 +1,624 @@
+/*
+ * IPVS:        Locality-Based Least-Connection scheduling module
+ *
+ * Version:     $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Martin Hamilton         :    fixed the terrible locking bugs
+ *                                   *lock(tbl->lock) ==> *lock(&tbl->lock)
+ *     Wensong Zhang           :    fixed the uninitilized tbl->lock bug
+ *     Wensong Zhang           :    added doing full expiration check to
+ *                                   collect stale entries of 24+ hours when
+ *                                   no partial expire check in a half hour
+ *     Julian Anastasov        :    replaced del_timer call with del_timer_sync
+ *                                   to avoid the possible race between timer
+ *                                   handler and del_timer thread in SMP
+ *
+ */
+
+/*
+ * The lblc algorithm is as follows (pseudo code):
+ *
+ *       if cachenode[dest_ip] is null then
+ *               n, cachenode[dest_ip] <- {weighted least-conn node};
+ *       else
+ *               n <- cachenode[dest_ip];
+ *               if (n is dead) OR
+ *                  (n.conns>n.weight AND
+ *                   there is a node m with m.conns<m.weight/2) then
+ *                 n, cachenode[dest_ip] <- {weighted least-conn node};
+ *
+ *       return n;
+ *
+ * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing
+ * me to write this module.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *    It is for garbage collection of stale IPVS lblc entries,
+ *    when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL   (60*HZ)
+#define ENTRY_TIMEOUT           (6*60*HZ)
+
+/*
+ *    It is for full expiration check.
+ *    When there is no partial expiration check (garbage collection)
+ *    in a half hour, do a full expiration check to collect stale
+ *    entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION   30
+static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
+
+
+/*
+ *     for IPVS lblc entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
+#define CONFIG_IP_VS_LBLC_TAB_BITS      10
+#endif
+#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
+#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)
+#define IP_VS_LBLC_TAB_MASK     (IP_VS_LBLC_TAB_SIZE - 1)
+
+
+/*
+ *      IPVS lblc entry represents an association between destination
+ *      IP address and its destination server
+ */
+struct ip_vs_lblc_entry {
+	struct list_head        list;
+	__u32                   addr;           /* destination IP address */
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+	unsigned long           lastuse;        /* last used time */
+};
+
+
+/*
+ *      IPVS lblc hash table
+ */
+struct ip_vs_lblc_table {
+	rwlock_t	        lock;           /* lock for this table */
+	struct list_head        bucket[IP_VS_LBLC_TAB_SIZE];  /* hash bucket */
+	atomic_t                entries;        /* number of entries */
+	int                     max_size;       /* maximum size of entries */
+	struct timer_list       periodic_timer; /* collect stale entries */
+	int                     rover;          /* rover for expire check */
+	int                     counter;        /* counter for no expire */
+};
+
+
+/*
+ *      IPVS LBLC sysctl table
+ */
+
+static ctl_table vs_vars_table[] = {
+	{
+		.ctl_name	= NET_IPV4_VS_LBLC_EXPIRE,
+		.procname	= "lblc_expiration",
+		.data		= &sysctl_ip_vs_lblc_expiration,
+		.maxlen		= sizeof(int),
+		.mode		= 0644, 
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table vs_table[] = {
+	{
+		.ctl_name	= NET_IPV4_VS,
+		.procname	= "vs",
+		.mode		= 0555, 
+		.child		= vs_vars_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ipv4_table[] = {
+	{
+		.ctl_name	= NET_IPV4,
+		.procname	= "ipv4", 
+		.mode		= 0555,
+		.child		= vs_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table lblc_root_table[] = {
+	{
+		.ctl_name	= CTL_NET,
+		.procname	= "net", 
+		.mode		= 0555, 
+		.child		= ipv4_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+/*
+ *      new/free a ip_vs_lblc_entry, which is a mapping of a destionation
+ *      IP address to a server.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest)
+{
+	struct ip_vs_lblc_entry *en;
+
+	en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC);
+	if (en == NULL) {
+		IP_VS_ERR("ip_vs_lblc_new(): no memory\n");
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&en->list);
+	en->addr = daddr;
+
+	atomic_inc(&dest->refcnt);
+	en->dest = dest;
+
+	return en;
+}
+
+
+static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
+{
+	list_del(&en->list);
+	/*
+	 * We don't kfree dest because it is refered either by its service
+	 * or the trash dest list.
+	 */
+	atomic_dec(&en->dest->refcnt);
+	kfree(en);
+}
+
+
+/*
+ *	Returns hash value for IPVS LBLC entry
+ */
+static inline unsigned ip_vs_lblc_hashkey(__u32 addr)
+{
+	return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+}
+
+
+/*
+ *	Hash an entry in the ip_vs_lblc_table.
+ *	returns bool success.
+ */
+static int
+ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
+{
+	unsigned hash;
+
+	if (!list_empty(&en->list)) {
+		IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	/*
+	 *	Hash by destination IP address
+	 */
+	hash = ip_vs_lblc_hashkey(en->addr);
+
+	write_lock(&tbl->lock);
+	list_add(&en->list, &tbl->bucket[hash]);
+	atomic_inc(&tbl->entries);
+	write_unlock(&tbl->lock);
+
+	return 1;
+}
+
+
+#if 0000
+/*
+ *	Unhash ip_vs_lblc_entry from ip_vs_lblc_table.
+ *	returns bool success.
+ */
+static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl,
+			     struct ip_vs_lblc_entry *en)
+{
+	if (list_empty(&en->list)) {
+		IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	/*
+	 * Remove it from the table
+	 */
+	write_lock(&tbl->lock);
+	list_del(&en->list);
+	INIT_LIST_HEAD(&en->list);
+	write_unlock(&tbl->lock);
+
+	return 1;
+}
+#endif
+
+
+/*
+ *  Get ip_vs_lblc_entry associated with supplied parameters.
+ */
+static inline struct ip_vs_lblc_entry *
+ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr)
+{
+	unsigned hash;
+	struct ip_vs_lblc_entry *en;
+
+	hash = ip_vs_lblc_hashkey(addr);
+
+	read_lock(&tbl->lock);
+
+	list_for_each_entry(en, &tbl->bucket[hash], list) {
+		if (en->addr == addr) {
+			/* HIT */
+			read_unlock(&tbl->lock);
+			return en;
+		}
+	}
+
+	read_unlock(&tbl->lock);
+
+	return NULL;
+}
+
+
+/*
+ *      Flush all the entries of the specified table.
+ */
+static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl)
+{
+	int i;
+	struct ip_vs_lblc_entry *en, *nxt;
+
+	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		write_lock(&tbl->lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+			ip_vs_lblc_free(en);
+			atomic_dec(&tbl->entries);
+		}
+		write_unlock(&tbl->lock);
+	}
+}
+
+
+static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl)
+{
+	unsigned long now = jiffies;
+	int i, j;
+	struct ip_vs_lblc_entry *en, *nxt;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+		write_lock(&tbl->lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_before(now, 
+					en->lastuse + sysctl_ip_vs_lblc_expiration))
+				continue;
+
+			ip_vs_lblc_free(en);
+			atomic_dec(&tbl->entries);
+		}
+		write_unlock(&tbl->lock);
+	}
+	tbl->rover = j;
+}
+
+
+/*
+ *      Periodical timer handler for IPVS lblc table
+ *      It is used to collect stale entries when the number of entries
+ *      exceeds the maximum size of the table.
+ *
+ *      Fixme: we probably need more complicated algorithm to collect
+ *             entries that have not been used for a long time even
+ *             if the number of entries doesn't exceed the maximum size
+ *             of the table.
+ *      The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblc_check_expire(unsigned long data)
+{
+	struct ip_vs_lblc_table *tbl;
+	unsigned long now = jiffies;
+	int goal;
+	int i, j;
+	struct ip_vs_lblc_entry *en, *nxt;
+
+	tbl = (struct ip_vs_lblc_table *)data;
+
+	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+		/* do full expiration check */
+		ip_vs_lblc_full_check(tbl);
+		tbl->counter = 1;
+		goto out;
+	}
+
+	if (atomic_read(&tbl->entries) <= tbl->max_size) {
+		tbl->counter++;
+		goto out;
+	}
+
+	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+	if (goal > tbl->max_size/2)
+		goal = tbl->max_size/2;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLC_TAB_MASK;
+
+		write_lock(&tbl->lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
+				continue;
+
+			ip_vs_lblc_free(en);
+			atomic_dec(&tbl->entries);
+			goal--;
+		}
+		write_unlock(&tbl->lock);
+		if (goal <= 0)
+			break;
+	}
+	tbl->rover = j;
+
+  out:
+	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_lblc_table *tbl;
+
+	/*
+	 *    Allocate the ip_vs_lblc_table for this service
+	 */
+	tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC);
+	if (tbl == NULL) {
+		IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_lblc_table));
+
+	/*
+	 *    Initialize the hash buckets
+	 */
+	for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) {
+		INIT_LIST_HEAD(&tbl->bucket[i]);
+	}
+	rwlock_init(&tbl->lock);
+	tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
+	tbl->rover = 0;
+	tbl->counter = 1;
+
+	/*
+	 *    Hook periodic timer for garbage collection
+	 */
+	init_timer(&tbl->periodic_timer);
+	tbl->periodic_timer.data = (unsigned long)tbl;
+	tbl->periodic_timer.function = ip_vs_lblc_check_expire;
+	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
+	add_timer(&tbl->periodic_timer);
+
+	return 0;
+}
+
+
+static int ip_vs_lblc_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblc_table *tbl = svc->sched_data;
+
+	/* remove periodic timer */
+	del_timer_sync(&tbl->periodic_timer);
+
+	/* got to clean up table entries here */
+	ip_vs_lblc_flush(tbl);
+
+	/* release the table itself */
+	kfree(svc->sched_data);
+	IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_lblc_table));
+
+	return 0;
+}
+
+
+static int ip_vs_lblc_update_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	/*
+	 * We think the overhead of processing active connections is fifty
+	 * times higher than that of inactive connections in average. (This
+	 * fifty times might not be accurate, we will change it later.) We
+	 * use the following formula to estimate the overhead:
+	 *                dest->activeconns*50 + dest->inactconns
+	 * and the load:
+	 *                (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *                h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connection.
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		if (atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = atomic_read(&least->activeconns) * 50
+				+ atomic_read(&least->inactconns);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+/*
+ *   If this destination server is overloaded and there is a less loaded
+ *   server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+		struct ip_vs_dest *d;
+
+		list_for_each_entry(d, &svc->destinations, n_list) {
+			if (atomic_read(&d->activeconns)*2
+			    < atomic_read(&d->weight)) {
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
+
+
+/*
+ *    Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_lblc_table *tbl;
+	struct ip_vs_lblc_entry *en;
+	struct iphdr *iph = skb->nh.iph;
+
+	IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
+
+	tbl = (struct ip_vs_lblc_table *)svc->sched_data;
+	en = ip_vs_lblc_get(tbl, iph->daddr);
+	if (en == NULL) {
+		dest = __ip_vs_wlc_schedule(svc, iph);
+		if (dest == NULL) {
+			IP_VS_DBG(1, "no destination available\n");
+			return NULL;
+		}
+		en = ip_vs_lblc_new(iph->daddr, dest);
+		if (en == NULL) {
+			return NULL;
+		}
+		ip_vs_lblc_hash(tbl, en);
+	} else {
+		dest = en->dest;
+		if (!(dest->flags & IP_VS_DEST_F_AVAILABLE)
+		    || atomic_read(&dest->weight) <= 0
+		    || is_overloaded(dest, svc)) {
+			dest = __ip_vs_wlc_schedule(svc, iph);
+			if (dest == NULL) {
+				IP_VS_DBG(1, "no destination available\n");
+				return NULL;
+			}
+			atomic_dec(&en->dest->refcnt);
+			atomic_inc(&dest->refcnt);
+			en->dest = dest;
+		}
+	}
+	en->lastuse = jiffies;
+
+	IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u "
+		  "--> server %u.%u.%u.%u:%d\n",
+		  NIPQUAD(en->addr),
+		  NIPQUAD(dest->addr),
+		  ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS LBLC Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblc_scheduler =
+{
+	.name =			"lblc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_lblc_init_svc,
+	.done_service =		ip_vs_lblc_done_svc,
+	.update_service =	ip_vs_lblc_update_svc,
+	.schedule =		ip_vs_lblc_schedule,
+};
+
+
+static int __init ip_vs_lblc_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list);
+	sysctl_header = register_sysctl_table(lblc_root_table, 0);
+	return register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+}
+
+
+static void __exit ip_vs_lblc_cleanup(void)
+{
+	unregister_sysctl_table(sysctl_header);
+	unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
+}
+
+
+module_init(ip_vs_lblc_init);
+module_exit(ip_vs_lblc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c
new file mode 100644
index 000000000000..22b5dd55d271
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lblcr.c
@@ -0,0 +1,888 @@
+/*
+ * IPVS:        Locality-Based Least-Connection with Replication scheduler
+ *
+ * Version:     $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Julian Anastasov        :    Added the missing (dest->weight>0)
+ *                                  condition in the ip_vs_dest_set_max.
+ *
+ */
+
+/*
+ * The lblc/r algorithm is as follows (pseudo code):
+ *
+ *       if serverSet[dest_ip] is null then
+ *               n, serverSet[dest_ip] <- {weighted least-conn node};
+ *       else
+ *               n <- {least-conn (alive) node in serverSet[dest_ip]};
+ *               if (n is null) OR
+ *                  (n.conns>n.weight AND
+ *                   there is a node m with m.conns<m.weight/2) then
+ *                   n <- {weighted least-conn node};
+ *                   add n to serverSet[dest_ip];
+ *               if |serverSet[dest_ip]| > 1 AND
+ *                   now - serverSet[dest_ip].lastMod > T then
+ *                   m <- {most conn node in serverSet[dest_ip]};
+ *                   remove m from serverSet[dest_ip];
+ *       if serverSet[dest_ip] changed then
+ *               serverSet[dest_ip].lastMod <- now;
+ *
+ *       return n;
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+/* for sysctl */
+#include <linux/fs.h>
+#include <linux/sysctl.h>
+/* for proc_net_create/proc_net_remove */
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *    It is for garbage collection of stale IPVS lblcr entries,
+ *    when the table is full.
+ */
+#define CHECK_EXPIRE_INTERVAL   (60*HZ)
+#define ENTRY_TIMEOUT           (6*60*HZ)
+
+/*
+ *    It is for full expiration check.
+ *    When there is no partial expiration check (garbage collection)
+ *    in a half hour, do a full expiration check to collect stale
+ *    entries that haven't been touched for a day.
+ */
+#define COUNT_FOR_FULL_EXPIRATION   30
+static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
+
+
+/*
+ *     for IPVS lblcr entry hash table
+ */
+#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS
+#define CONFIG_IP_VS_LBLCR_TAB_BITS      10
+#endif
+#define IP_VS_LBLCR_TAB_BITS     CONFIG_IP_VS_LBLCR_TAB_BITS
+#define IP_VS_LBLCR_TAB_SIZE     (1 << IP_VS_LBLCR_TAB_BITS)
+#define IP_VS_LBLCR_TAB_MASK     (IP_VS_LBLCR_TAB_SIZE - 1)
+
+
+/*
+ *      IPVS destination set structure and operations
+ */
+struct ip_vs_dest_list {
+	struct ip_vs_dest_list  *next;          /* list link */
+	struct ip_vs_dest       *dest;          /* destination server */
+};
+
+struct ip_vs_dest_set {
+	atomic_t                size;           /* set size */
+	unsigned long           lastmod;        /* last modified time */
+	struct ip_vs_dest_list  *list;          /* destination list */
+	rwlock_t	        lock;           /* lock for this list */
+};
+
+
+static struct ip_vs_dest_list *
+ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+	struct ip_vs_dest_list *e;
+
+	for (e=set->list; e!=NULL; e=e->next) {
+		if (e->dest == dest)
+			/* already existed */
+			return NULL;
+	}
+
+	e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC);
+	if (e == NULL) {
+		IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n");
+		return NULL;
+	}
+
+	atomic_inc(&dest->refcnt);
+	e->dest = dest;
+
+	/* link it to the list */
+	write_lock(&set->lock);
+	e->next = set->list;
+	set->list = e;
+	atomic_inc(&set->size);
+	write_unlock(&set->lock);
+
+	set->lastmod = jiffies;
+	return e;
+}
+
+static void
+ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest)
+{
+	struct ip_vs_dest_list *e, **ep;
+
+	write_lock(&set->lock);
+	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+		if (e->dest == dest) {
+			/* HIT */
+			*ep = e->next;
+			atomic_dec(&set->size);
+			set->lastmod = jiffies;
+			atomic_dec(&e->dest->refcnt);
+			kfree(e);
+			break;
+		}
+		ep = &e->next;
+	}
+	write_unlock(&set->lock);
+}
+
+static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
+{
+	struct ip_vs_dest_list *e, **ep;
+
+	write_lock(&set->lock);
+	for (ep=&set->list, e=*ep; e!=NULL; e=*ep) {
+		*ep = e->next;
+		/*
+		 * We don't kfree dest because it is refered either
+		 * by its service or by the trash dest list.
+		 */
+		atomic_dec(&e->dest->refcnt);
+		kfree(e);
+	}
+	write_unlock(&set->lock);
+}
+
+/* get weighted least-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+{
+	register struct ip_vs_dest_list *e;
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	if (set == NULL)
+		return NULL;
+
+	read_lock(&set->lock);
+	/* select the first destination server, whose weight > 0 */
+	for (e=set->list; e!=NULL; e=e->next) {
+		least = e->dest;
+		if (least->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		if ((atomic_read(&least->weight) > 0)
+		    && (least->flags & IP_VS_DEST_F_AVAILABLE)) {
+			loh = atomic_read(&least->activeconns) * 50
+				+ atomic_read(&least->inactconns);
+			goto nextstage;
+		}
+	}
+	read_unlock(&set->lock);
+	return NULL;
+
+	/* find the destination with the weighted least load */
+  nextstage:
+	for (e=e->next; e!=NULL; e=e->next) {
+		dest = e->dest;
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		if ((loh * atomic_read(&dest->weight) >
+		     doh * atomic_read(&least->weight))
+		    && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+	read_unlock(&set->lock);
+
+	IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+	return least;
+}
+
+
+/* get weighted most-connection node in the destination set */
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+{
+	register struct ip_vs_dest_list *e;
+	struct ip_vs_dest *dest, *most;
+	int moh, doh;
+
+	if (set == NULL)
+		return NULL;
+
+	read_lock(&set->lock);
+	/* select the first destination server, whose weight > 0 */
+	for (e=set->list; e!=NULL; e=e->next) {
+		most = e->dest;
+		if (atomic_read(&most->weight) > 0) {
+			moh = atomic_read(&most->activeconns) * 50
+				+ atomic_read(&most->inactconns);
+			goto nextstage;
+		}
+	}
+	read_unlock(&set->lock);
+	return NULL;
+
+	/* find the destination with the weighted most load */
+  nextstage:
+	for (e=e->next; e!=NULL; e=e->next) {
+		dest = e->dest;
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
+		if ((moh * atomic_read(&dest->weight) <
+		     doh * atomic_read(&most->weight))
+		    && (atomic_read(&dest->weight) > 0)) {
+			most = dest;
+			moh = doh;
+		}
+	}
+	read_unlock(&set->lock);
+
+	IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(most->addr), ntohs(most->port),
+		  atomic_read(&most->activeconns),
+		  atomic_read(&most->refcnt),
+		  atomic_read(&most->weight), moh);
+	return most;
+}
+
+
+/*
+ *      IPVS lblcr entry represents an association between destination
+ *      IP address and its destination server set
+ */
+struct ip_vs_lblcr_entry {
+	struct list_head        list;
+	__u32                   addr;           /* destination IP address */
+	struct ip_vs_dest_set   set;            /* destination server set */
+	unsigned long           lastuse;        /* last used time */
+};
+
+
+/*
+ *      IPVS lblcr hash table
+ */
+struct ip_vs_lblcr_table {
+	rwlock_t	        lock;           /* lock for this table */
+	struct list_head        bucket[IP_VS_LBLCR_TAB_SIZE];  /* hash bucket */
+	atomic_t                entries;        /* number of entries */
+	int                     max_size;       /* maximum size of entries */
+	struct timer_list       periodic_timer; /* collect stale entries */
+	int                     rover;          /* rover for expire check */
+	int                     counter;        /* counter for no expire */
+};
+
+
+/*
+ *      IPVS LBLCR sysctl table
+ */
+
+static ctl_table vs_vars_table[] = {
+	{
+		.ctl_name	= NET_IPV4_VS_LBLCR_EXPIRE,
+		.procname	= "lblcr_expiration",
+		.data		= &sysctl_ip_vs_lblcr_expiration,
+		.maxlen		= sizeof(int),
+		.mode		= 0644, 
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table vs_table[] = {
+	{
+		.ctl_name	= NET_IPV4_VS,
+		.procname	= "vs",
+		.mode		= 0555,
+		.child		= vs_vars_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ipv4_table[] = {
+	{
+		.ctl_name	= NET_IPV4,
+		.procname	= "ipv4", 
+		.mode		= 0555,
+		.child		= vs_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table lblcr_root_table[] = {
+	{
+		.ctl_name	= CTL_NET,
+		.procname	= "net", 
+		.mode		= 0555, 
+		.child		= ipv4_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header * sysctl_header;
+
+/*
+ *      new/free a ip_vs_lblcr_entry, which is a mapping of a destination
+ *      IP address to a server.
+ */
+static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr)
+{
+	struct ip_vs_lblcr_entry *en;
+
+	en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC);
+	if (en == NULL) {
+		IP_VS_ERR("ip_vs_lblcr_new(): no memory\n");
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&en->list);
+	en->addr = daddr;
+
+	/* initilize its dest set */
+	atomic_set(&(en->set.size), 0);
+	en->set.list = NULL;
+	rwlock_init(&en->set.lock);
+
+	return en;
+}
+
+
+static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
+{
+	list_del(&en->list);
+	ip_vs_dest_set_eraseall(&en->set);
+	kfree(en);
+}
+
+
+/*
+ *	Returns hash value for IPVS LBLCR entry
+ */
+static inline unsigned ip_vs_lblcr_hashkey(__u32 addr)
+{
+	return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+}
+
+
+/*
+ *	Hash an entry in the ip_vs_lblcr_table.
+ *	returns bool success.
+ */
+static int
+ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en)
+{
+	unsigned hash;
+
+	if (!list_empty(&en->list)) {
+		IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	/*
+	 *	Hash by destination IP address
+	 */
+	hash = ip_vs_lblcr_hashkey(en->addr);
+
+	write_lock(&tbl->lock);
+	list_add(&en->list, &tbl->bucket[hash]);
+	atomic_inc(&tbl->entries);
+	write_unlock(&tbl->lock);
+
+	return 1;
+}
+
+
+#if 0000
+/*
+ *	Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table.
+ *	returns bool success.
+ */
+static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl,
+			     struct ip_vs_lblcr_entry *en)
+{
+	if (list_empty(&en->list)) {
+		IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, "
+			  "called from %p\n", __builtin_return_address(0));
+		return 0;
+	}
+
+	/*
+	 * Remove it from the table
+	 */
+	write_lock(&tbl->lock);
+	list_del(&en->list);
+	INIT_LIST_HEAD(&en->list);
+	write_unlock(&tbl->lock);
+
+	return 1;
+}
+#endif
+
+
+/*
+ *  Get ip_vs_lblcr_entry associated with supplied parameters.
+ */
+static inline struct ip_vs_lblcr_entry *
+ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr)
+{
+	unsigned hash;
+	struct ip_vs_lblcr_entry *en;
+
+	hash = ip_vs_lblcr_hashkey(addr);
+
+	read_lock(&tbl->lock);
+
+	list_for_each_entry(en, &tbl->bucket[hash], list) {
+		if (en->addr == addr) {
+			/* HIT */
+			read_unlock(&tbl->lock);
+			return en;
+		}
+	}
+
+	read_unlock(&tbl->lock);
+
+	return NULL;
+}
+
+
+/*
+ *      Flush all the entries of the specified table.
+ */
+static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl)
+{
+	int i;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		write_lock(&tbl->lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) {
+			ip_vs_lblcr_free(en);
+			atomic_dec(&tbl->entries);
+		}
+		write_unlock(&tbl->lock);
+	}
+}
+
+
+static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl)
+{
+	unsigned long now = jiffies;
+	int i, j;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+		write_lock(&tbl->lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration,
+				       now))
+				continue;
+
+			ip_vs_lblcr_free(en);
+			atomic_dec(&tbl->entries);
+		}
+		write_unlock(&tbl->lock);
+	}
+	tbl->rover = j;
+}
+
+
+/*
+ *      Periodical timer handler for IPVS lblcr table
+ *      It is used to collect stale entries when the number of entries
+ *      exceeds the maximum size of the table.
+ *
+ *      Fixme: we probably need more complicated algorithm to collect
+ *             entries that have not been used for a long time even
+ *             if the number of entries doesn't exceed the maximum size
+ *             of the table.
+ *      The full expiration check is for this purpose now.
+ */
+static void ip_vs_lblcr_check_expire(unsigned long data)
+{
+	struct ip_vs_lblcr_table *tbl;
+	unsigned long now = jiffies;
+	int goal;
+	int i, j;
+	struct ip_vs_lblcr_entry *en, *nxt;
+
+	tbl = (struct ip_vs_lblcr_table *)data;
+
+	if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
+		/* do full expiration check */
+		ip_vs_lblcr_full_check(tbl);
+		tbl->counter = 1;
+		goto out;
+	}
+
+	if (atomic_read(&tbl->entries) <= tbl->max_size) {
+		tbl->counter++;
+		goto out;
+	}
+
+	goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
+	if (goal > tbl->max_size/2)
+		goal = tbl->max_size/2;
+
+	for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
+
+		write_lock(&tbl->lock);
+		list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
+			if (time_before(now, en->lastuse+ENTRY_TIMEOUT))
+				continue;
+
+			ip_vs_lblcr_free(en);
+			atomic_dec(&tbl->entries);
+			goal--;
+		}
+		write_unlock(&tbl->lock);
+		if (goal <= 0)
+			break;
+	}
+	tbl->rover = j;
+
+  out:
+	mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL);
+}
+
+
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+static struct ip_vs_lblcr_table *lblcr_table_list;
+
+/*
+ *	/proc/net/ip_vs_lblcr to display the mappings of
+ *                  destination IP address <==> its serverSet
+ */
+static int
+ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length)
+{
+	off_t pos=0, begin;
+	int len=0, size;
+	struct ip_vs_lblcr_table *tbl;
+	unsigned long now = jiffies;
+	int i;
+	struct ip_vs_lblcr_entry *en;
+
+	tbl = lblcr_table_list;
+
+	size = sprintf(buffer, "LastTime Dest IP address  Server set\n");
+	pos += size;
+	len += size;
+
+	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		read_lock_bh(&tbl->lock);
+		list_for_each_entry(en, &tbl->bucket[i], list) {
+			char tbuf[16];
+			struct ip_vs_dest_list *d;
+
+			sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr));
+			size = sprintf(buffer+len, "%8lu %-16s ",
+				       now-en->lastuse, tbuf);
+
+			read_lock(&en->set.lock);
+			for (d=en->set.list; d!=NULL; d=d->next) {
+				size += sprintf(buffer+len+size,
+						"%u.%u.%u.%u ",
+						NIPQUAD(d->dest->addr));
+			}
+			read_unlock(&en->set.lock);
+			size += sprintf(buffer+len+size, "\n");
+			len += size;
+			pos += size;
+			if (pos <= offset)
+				len=0;
+			if (pos >= offset+length) {
+				read_unlock_bh(&tbl->lock);
+				goto done;
+			}
+		}
+		read_unlock_bh(&tbl->lock);
+	}
+
+  done:
+	begin = len - (pos - offset);
+	*start = buffer + begin;
+	len -= begin;
+	if(len>length)
+		len = length;
+	return len;
+}
+#endif
+
+
+static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_lblcr_table *tbl;
+
+	/*
+	 *    Allocate the ip_vs_lblcr_table for this service
+	 */
+	tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC);
+	if (tbl == NULL) {
+		IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_lblcr_table));
+
+	/*
+	 *    Initialize the hash buckets
+	 */
+	for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) {
+		INIT_LIST_HEAD(&tbl->bucket[i]);
+	}
+	rwlock_init(&tbl->lock);
+	tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16;
+	tbl->rover = 0;
+	tbl->counter = 1;
+
+	/*
+	 *    Hook periodic timer for garbage collection
+	 */
+	init_timer(&tbl->periodic_timer);
+	tbl->periodic_timer.data = (unsigned long)tbl;
+	tbl->periodic_timer.function = ip_vs_lblcr_check_expire;
+	tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL;
+	add_timer(&tbl->periodic_timer);
+
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+	lblcr_table_list = tbl;
+#endif
+	return 0;
+}
+
+
+static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_lblcr_table *tbl = svc->sched_data;
+
+	/* remove periodic timer */
+	del_timer_sync(&tbl->periodic_timer);
+
+	/* got to clean up table entries here */
+	ip_vs_lblcr_flush(tbl);
+
+	/* release the table itself */
+	kfree(svc->sched_data);
+	IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_lblcr_table));
+
+	return 0;
+}
+
+
+static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static inline struct ip_vs_dest *
+__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
+{
+	struct ip_vs_dest *dest, *least;
+	int loh, doh;
+
+	/*
+	 * We think the overhead of processing active connections is fifty
+	 * times higher than that of inactive connections in average. (This
+	 * fifty times might not be accurate, we will change it later.) We
+	 * use the following formula to estimate the overhead:
+	 *                dest->activeconns*50 + dest->inactconns
+	 * and the load:
+	 *                (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *                h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connection.
+	 */
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		if (atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = atomic_read(&least->activeconns) * 50
+				+ atomic_read(&least->inactconns);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+
+		doh = atomic_read(&dest->activeconns) * 50
+			+ atomic_read(&dest->inactconns);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+/*
+ *   If this destination server is overloaded and there is a less loaded
+ *   server, then return true.
+ */
+static inline int
+is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+	if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) {
+		struct ip_vs_dest *d;
+
+		list_for_each_entry(d, &svc->destinations, n_list) {
+			if (atomic_read(&d->activeconns)*2
+			    < atomic_read(&d->weight)) {
+				return 1;
+			}
+		}
+	}
+	return 0;
+}
+
+
+/*
+ *    Locality-Based (weighted) Least-Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_lblcr_table *tbl;
+	struct ip_vs_lblcr_entry *en;
+	struct iphdr *iph = skb->nh.iph;
+
+	IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
+
+	tbl = (struct ip_vs_lblcr_table *)svc->sched_data;
+	en = ip_vs_lblcr_get(tbl, iph->daddr);
+	if (en == NULL) {
+		dest = __ip_vs_wlc_schedule(svc, iph);
+		if (dest == NULL) {
+			IP_VS_DBG(1, "no destination available\n");
+			return NULL;
+		}
+		en = ip_vs_lblcr_new(iph->daddr);
+		if (en == NULL) {
+			return NULL;
+		}
+		ip_vs_dest_set_insert(&en->set, dest);
+		ip_vs_lblcr_hash(tbl, en);
+	} else {
+		dest = ip_vs_dest_set_min(&en->set);
+		if (!dest || is_overloaded(dest, svc)) {
+			dest = __ip_vs_wlc_schedule(svc, iph);
+			if (dest == NULL) {
+				IP_VS_DBG(1, "no destination available\n");
+				return NULL;
+			}
+			ip_vs_dest_set_insert(&en->set, dest);
+		}
+		if (atomic_read(&en->set.size) > 1 &&
+		    jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) {
+			struct ip_vs_dest *m;
+			m = ip_vs_dest_set_max(&en->set);
+			if (m)
+				ip_vs_dest_set_erase(&en->set, m);
+		}
+	}
+	en->lastuse = jiffies;
+
+	IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u "
+		  "--> server %u.%u.%u.%u:%d\n",
+		  NIPQUAD(en->addr),
+		  NIPQUAD(dest->addr),
+		  ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS LBLCR Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
+{
+	.name =			"lblcr",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_lblcr_init_svc,
+	.done_service =		ip_vs_lblcr_done_svc,
+	.update_service =	ip_vs_lblcr_update_svc,
+	.schedule =		ip_vs_lblcr_schedule,
+};
+
+
+static int __init ip_vs_lblcr_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list);
+	sysctl_header = register_sysctl_table(lblcr_root_table, 0);
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+	proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo);
+#endif
+	return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+}
+
+
+static void __exit ip_vs_lblcr_cleanup(void)
+{
+#ifdef CONFIG_IP_VS_LBLCR_DEBUG
+	proc_net_remove("ip_vs_lblcr");
+#endif
+	unregister_sysctl_table(sysctl_header);
+	unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
+}
+
+
+module_init(ip_vs_lblcr_init);
+module_exit(ip_vs_lblcr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c
new file mode 100644
index 000000000000..d88fef90a641
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_lc.c
@@ -0,0 +1,123 @@
+/*
+ * IPVS:        Least-Connection Scheduling module
+ *
+ * Version:     $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     added the ip_vs_lc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_lc_init_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int ip_vs_lc_done_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int ip_vs_lc_update_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static inline unsigned int
+ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We think the overhead of processing active connections is 256
+	 * times higher than that of inactive connections in average. (This
+	 * 256 times might not be accurate, we will change it later) We
+	 * use the following formula to estimate the overhead now:
+	 *		  dest->activeconns*256 + dest->inactconns
+	 */
+	return (atomic_read(&dest->activeconns) << 8) +
+		atomic_read(&dest->inactconns);
+}
+
+
+/*
+ *	Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least = NULL;
+	unsigned int loh = 0, doh;
+
+	IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n");
+
+	/*
+	 * Simply select the server with the least number of
+	 *        (activeconns<<5) + inactconns
+	 * Except whose weight is equal to zero.
+	 * If the weight is equal to zero, it means that the server is
+	 * quiesced, the existing connections to the server still get
+	 * served, but no new connection is assigned to the server.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
+		    atomic_read(&dest->weight) == 0)
+			continue;
+		doh = ip_vs_lc_dest_overhead(dest);
+		if (!least || doh < loh) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	if (least)
+	IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n",
+		  NIPQUAD(least->addr), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->inactconns));
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_lc_scheduler = {
+	.name =			"lc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_lc_init_svc,
+	.done_service =		ip_vs_lc_done_svc,
+	.update_service =	ip_vs_lc_update_svc,
+	.schedule =		ip_vs_lc_schedule,
+};
+
+
+static int __init ip_vs_lc_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ;
+}
+
+static void __exit ip_vs_lc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_lc_scheduler);
+}
+
+module_init(ip_vs_lc_init);
+module_exit(ip_vs_lc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c
new file mode 100644
index 000000000000..bc2a9e5f2a7b
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_nq.c
@@ -0,0 +1,161 @@
+/*
+ * IPVS:        Never Queue scheduling module
+ *
+ * Version:     $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The NQ algorithm adopts a two-speed model. When there is an idle server
+ * available, the job will be sent to the idle server, instead of waiting
+ * for a fast one. When there is no idle server available, the job will be
+ * sent to the server that minimize its expected delay (The Shortest
+ * Expected Delay scheduling algorithm).
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me.
+ *
+ * The difference between NQ and SED is that NQ can improve overall
+ * system utilization.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_nq_init_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int
+ip_vs_nq_done_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int
+ip_vs_nq_update_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static inline unsigned int
+ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We only use the active connection number in the cost
+	 * calculation here.
+	 */
+	return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ *	Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least = NULL;
+	unsigned int loh = 0, doh;
+
+	IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n");
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *	(server expected overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD ||
+		    !atomic_read(&dest->weight))
+			continue;
+
+		doh = ip_vs_nq_dest_overhead(dest);
+
+		/* return the server directly if it is idle */
+		if (atomic_read(&dest->activeconns) == 0) {
+			least = dest;
+			loh = doh;
+			goto out;
+		}
+
+		if (!least ||
+		    (loh * atomic_read(&dest->weight) >
+		     doh * atomic_read(&least->weight))) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	if (!least)
+		return NULL;
+
+  out:
+	IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_nq_scheduler =
+{
+	.name =			"nq",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_nq_init_svc,
+	.done_service =		ip_vs_nq_done_svc,
+	.update_service =	ip_vs_nq_update_svc,
+	.schedule =		ip_vs_nq_schedule,
+};
+
+
+static int __init ip_vs_nq_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+static void __exit ip_vs_nq_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_nq_scheduler);
+}
+
+module_init(ip_vs_nq_init);
+module_exit(ip_vs_nq_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c
new file mode 100644
index 000000000000..253c46252bd5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto.c
@@ -0,0 +1,244 @@
+/*
+ * ip_vs_proto.c: transport protocol load balancing support for IPVS
+ *
+ * Version:     $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <asm/system.h>
+#include <linux/stat.h>
+#include <linux/proc_fs.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ * IPVS protocols can only be registered/unregistered when the ipvs
+ * module is loaded/unloaded, so no lock is needed in accessing the
+ * ipvs protocol table.
+ */
+
+#define IP_VS_PROTO_TAB_SIZE		32	/* must be power of 2 */
+#define IP_VS_PROTO_HASH(proto)		((proto) & (IP_VS_PROTO_TAB_SIZE-1))
+
+static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+
+
+/*
+ *	register an ipvs protocol
+ */
+static int register_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+
+	pp->next = ip_vs_proto_table[hash];
+	ip_vs_proto_table[hash] = pp;
+
+	if (pp->init != NULL)
+		pp->init(pp);
+
+	return 0;
+}
+
+
+/*
+ *	unregister an ipvs protocol
+ */
+static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
+{
+	struct ip_vs_protocol **pp_p;
+	unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
+
+	pp_p = &ip_vs_proto_table[hash];
+	for (; *pp_p; pp_p = &(*pp_p)->next) {
+		if (*pp_p == pp) {
+			*pp_p = pp->next;
+			if (pp->exit != NULL)
+				pp->exit(pp);
+			return 0;
+		}
+	}
+
+	return -ESRCH;
+}
+
+
+/*
+ *	get ip_vs_protocol object by its proto.
+ */
+struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
+{
+	struct ip_vs_protocol *pp;
+	unsigned hash = IP_VS_PROTO_HASH(proto);
+
+	for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) {
+		if (pp->protocol == proto)
+			return pp;
+	}
+
+	return NULL;
+}
+
+
+/*
+ *	Propagate event for state change to all protocols
+ */
+void ip_vs_protocol_timeout_change(int flags)
+{
+	struct ip_vs_protocol *pp;
+	int i;
+
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) {
+			if (pp->timeout_change)
+				pp->timeout_change(pp, flags);
+		}
+	}
+}
+
+
+int *
+ip_vs_create_timeout_table(int *table, int size)
+{
+	int *t;
+
+	t = kmalloc(size, GFP_ATOMIC);
+	if (t == NULL)
+		return NULL;
+	memcpy(t, table, size);
+	return t;
+}
+
+
+/*
+ *	Set timeout value for state specified by name
+ */
+int
+ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to)
+{
+	int i;
+
+	if (!table || !name || !to)
+		return -EINVAL;
+
+	for (i = 0; i < num; i++) {
+		if (strcmp(names[i], name))
+			continue;
+		table[i] = to * HZ;
+		return 0;
+	}
+	return -ENOENT;
+}
+
+
+const char * ip_vs_state_name(__u16 proto, int state)
+{
+	struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+
+	if (pp == NULL || pp->state_name == NULL)
+		return "ERR!";
+	return pp->state_name(state);
+}
+
+
+void
+ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
+			  const struct sk_buff *skb,
+			  int offset,
+			  const char *msg)
+{
+	char buf[128];
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else if (ih->frag_off & __constant_htons(IP_OFFSET))
+		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
+			pp->name, NIPQUAD(ih->saddr),
+			NIPQUAD(ih->daddr));
+	else {
+		__u16 _ports[2], *pptr
+;
+		pptr = skb_header_pointer(skb, offset + ih->ihl*4,
+					  sizeof(_ports), _ports);
+		if (pptr == NULL)
+			sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
+				pp->name,
+				NIPQUAD(ih->saddr),
+				NIPQUAD(ih->daddr));
+		else
+			sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
+				pp->name,
+				NIPQUAD(ih->saddr),
+				ntohs(pptr[0]),
+				NIPQUAD(ih->daddr),
+				ntohs(pptr[1]));
+	}
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+int ip_vs_protocol_init(void)
+{
+	char protocols[64];
+#define REGISTER_PROTOCOL(p)			\
+	do {					\
+		register_ip_vs_protocol(p);	\
+		strcat(protocols, ", ");	\
+		strcat(protocols, (p)->name);	\
+	} while (0)
+
+	protocols[0] = '\0';
+	protocols[2] = '\0';
+#ifdef CONFIG_IP_VS_PROTO_TCP
+	REGISTER_PROTOCOL(&ip_vs_protocol_tcp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_UDP
+	REGISTER_PROTOCOL(&ip_vs_protocol_udp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ICMP
+	REGISTER_PROTOCOL(&ip_vs_protocol_icmp);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_AH
+	REGISTER_PROTOCOL(&ip_vs_protocol_ah);
+#endif
+#ifdef CONFIG_IP_VS_PROTO_ESP
+	REGISTER_PROTOCOL(&ip_vs_protocol_esp);
+#endif
+	IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]);
+
+	return 0;
+}
+
+
+void ip_vs_protocol_cleanup(void)
+{
+	struct ip_vs_protocol *pp;
+	int i;
+
+	/* unregister all the ipvs protocols */
+	for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
+		while ((pp = ip_vs_proto_table[i]) != NULL)
+			unregister_ip_vs_protocol(pp);
+	}
+}
diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c
new file mode 100644
index 000000000000..453e94a0bbd7
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_ah.c
@@ -0,0 +1,177 @@
+/*
+ * ip_vs_proto_ah.c:	AH IPSec load balancing support for IPVS
+ *
+ * Version:     $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
+ *
+ * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
+ *		Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+	__u8		icookie[8];
+	__u8		rcookie[8];
+	__u8		np;
+	__u8		version;
+	__u8		xchgtype;
+	__u8		flags;
+	__u32		msgid;
+	__u32		length;
+};
+
+*/
+
+#define PORT_ISAKMP	500
+
+
+static struct ip_vs_conn *
+ah_conn_in_get(const struct sk_buff *skb,
+	       struct ip_vs_protocol *pp,
+	       const struct iphdr *iph,
+	       unsigned int proto_off,
+	       int inverse)
+{
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_in_get(IPPROTO_UDP,
+				       iph->saddr,
+				       __constant_htons(PORT_ISAKMP),
+				       iph->daddr,
+				       __constant_htons(PORT_ISAKMP));
+	} else {
+		cp = ip_vs_conn_in_get(IPPROTO_UDP,
+				       iph->daddr,
+				       __constant_htons(PORT_ISAKMP),
+				       iph->saddr,
+				       __constant_htons(PORT_ISAKMP));
+	}
+
+	if (!cp) {
+		/*
+		 * We are not sure if the packet is from our
+		 * service, so our conn_schedule hook should return NF_ACCEPT
+		 */
+		IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
+			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+			  inverse ? "ICMP+" : "",
+			  pp->name,
+			  NIPQUAD(iph->saddr),
+			  NIPQUAD(iph->daddr));
+	}
+
+	return cp;
+}
+
+
+static struct ip_vs_conn *
+ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_out_get(IPPROTO_UDP,
+					iph->saddr,
+					__constant_htons(PORT_ISAKMP),
+					iph->daddr,
+					__constant_htons(PORT_ISAKMP));
+	} else {
+		cp = ip_vs_conn_out_get(IPPROTO_UDP,
+					iph->daddr,
+					__constant_htons(PORT_ISAKMP),
+					iph->saddr,
+					__constant_htons(PORT_ISAKMP));
+	}
+
+	if (!cp) {
+		IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
+			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+			  inverse ? "ICMP+" : "",
+			  pp->name,
+			  NIPQUAD(iph->saddr),
+			  NIPQUAD(iph->daddr));
+	}
+
+	return cp;
+}
+
+
+static int
+ah_conn_schedule(struct sk_buff *skb,
+		 struct ip_vs_protocol *pp,
+		 int *verdict, struct ip_vs_conn **cpp)
+{
+	/*
+	 * AH is only related traffic. Pass the packet to IP stack.
+	 */
+	*verdict = NF_ACCEPT;
+	return 0;
+}
+
+
+static void
+ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		int offset, const char *msg)
+{
+	char buf[256];
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else
+		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+			pp->name, NIPQUAD(ih->saddr),
+			NIPQUAD(ih->daddr));
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+static void ah_init(struct ip_vs_protocol *pp)
+{
+	/* nothing to do now */
+}
+
+
+static void ah_exit(struct ip_vs_protocol *pp)
+{
+	/* nothing to do now */
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_ah = {
+	.name =			"AH",
+	.protocol =		IPPROTO_AH,
+	.dont_defrag =		1,
+	.init =			ah_init,
+	.exit =			ah_exit,
+	.conn_schedule =	ah_conn_schedule,
+	.conn_in_get =		ah_conn_in_get,
+	.conn_out_get =		ah_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		ah_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+	.set_state_timeout =	NULL,
+};
diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c
new file mode 100644
index 000000000000..478e5c7c7e8e
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_esp.c
@@ -0,0 +1,175 @@
+/*
+ * ip_vs_proto_esp.c:	ESP IPSec load balancing support for IPVS
+ *
+ * Version:     $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $
+ *
+ * Authors:	Julian Anastasov <ja@ssi.bg>, February 2002
+ *		Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/* TODO:
+
+struct isakmp_hdr {
+	__u8		icookie[8];
+	__u8		rcookie[8];
+	__u8		np;
+	__u8		version;
+	__u8		xchgtype;
+	__u8		flags;
+	__u32		msgid;
+	__u32		length;
+};
+
+*/
+
+#define PORT_ISAKMP	500
+
+
+static struct ip_vs_conn *
+esp_conn_in_get(const struct sk_buff *skb,
+		struct ip_vs_protocol *pp,
+		const struct iphdr *iph,
+		unsigned int proto_off,
+		int inverse)
+{
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_in_get(IPPROTO_UDP,
+				       iph->saddr,
+				       __constant_htons(PORT_ISAKMP),
+				       iph->daddr,
+				       __constant_htons(PORT_ISAKMP));
+	} else {
+		cp = ip_vs_conn_in_get(IPPROTO_UDP,
+				       iph->daddr,
+				       __constant_htons(PORT_ISAKMP),
+				       iph->saddr,
+				       __constant_htons(PORT_ISAKMP));
+	}
+
+	if (!cp) {
+		/*
+		 * We are not sure if the packet is from our
+		 * service, so our conn_schedule hook should return NF_ACCEPT
+		 */
+		IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet "
+			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+			  inverse ? "ICMP+" : "",
+			  pp->name,
+			  NIPQUAD(iph->saddr),
+			  NIPQUAD(iph->daddr));
+	}
+
+	return cp;
+}
+
+
+static struct ip_vs_conn *
+esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		 const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_out_get(IPPROTO_UDP,
+					iph->saddr,
+					__constant_htons(PORT_ISAKMP),
+					iph->daddr,
+					__constant_htons(PORT_ISAKMP));
+	} else {
+		cp = ip_vs_conn_out_get(IPPROTO_UDP,
+					iph->daddr,
+					__constant_htons(PORT_ISAKMP),
+					iph->saddr,
+					__constant_htons(PORT_ISAKMP));
+	}
+
+	if (!cp) {
+		IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet "
+			  "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n",
+			  inverse ? "ICMP+" : "",
+			  pp->name,
+			  NIPQUAD(iph->saddr),
+			  NIPQUAD(iph->daddr));
+	}
+
+	return cp;
+}
+
+
+static int
+esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		  int *verdict, struct ip_vs_conn **cpp)
+{
+	/*
+	 * ESP is only related traffic. Pass the packet to IP stack.
+	 */
+	*verdict = NF_ACCEPT;
+	return 0;
+}
+
+
+static void
+esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
+		 int offset, const char *msg)
+{
+	char buf[256];
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else
+		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
+			pp->name, NIPQUAD(ih->saddr),
+			NIPQUAD(ih->daddr));
+
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+
+static void esp_init(struct ip_vs_protocol *pp)
+{
+	/* nothing to do now */
+}
+
+
+static void esp_exit(struct ip_vs_protocol *pp)
+{
+	/* nothing to do now */
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_esp = {
+	.name =			"ESP",
+	.protocol =		IPPROTO_ESP,
+	.dont_defrag =		1,
+	.init =			esp_init,
+	.exit =			esp_exit,
+	.conn_schedule =	esp_conn_schedule,
+	.conn_in_get =		esp_conn_in_get,
+	.conn_out_get =		esp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		NULL,
+	.state_transition =	NULL,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		esp_debug_packet,
+	.timeout_change =	NULL,		/* ISAKMP */
+};
diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c
new file mode 100644
index 000000000000..191e94aa1c1f
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c
@@ -0,0 +1,182 @@
+/*
+ * ip_vs_proto_icmp.c:	ICMP load balancing support for IP Virtual Server
+ *
+ * Authors:	Julian Anastasov <ja@ssi.bg>, March 2002
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		version 2 as published by the Free Software Foundation;
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+static int icmp_timeouts[1] =		{ 1*60*HZ };
+
+static char * icmp_state_name_table[1] = { "ICMP" };
+
+static struct ip_vs_conn *
+icmp_conn_in_get(const struct sk_buff *skb,
+		 struct ip_vs_protocol *pp,
+		 const struct iphdr *iph,
+		 unsigned int proto_off,
+		 int inverse)
+{
+#if 0
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_in_get(iph->protocol,
+			iph->saddr, 0,
+			iph->daddr, 0);
+	} else {
+		cp = ip_vs_conn_in_get(iph->protocol,
+			iph->daddr, 0,
+			iph->saddr, 0);
+	}
+
+	return cp;
+
+#else
+	return NULL;
+#endif
+}
+
+static struct ip_vs_conn *
+icmp_conn_out_get(const struct sk_buff *skb,
+		  struct ip_vs_protocol *pp,
+		  const struct iphdr *iph,
+		  unsigned int proto_off,
+		  int inverse)
+{
+#if 0
+	struct ip_vs_conn *cp;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_out_get(iph->protocol,
+			iph->saddr, 0,
+			iph->daddr, 0);
+	} else {
+		cp = ip_vs_conn_out_get(IPPROTO_UDP,
+			iph->daddr, 0,
+			iph->saddr, 0);
+	}
+
+	return cp;
+#else
+	return NULL;
+#endif
+}
+
+static int
+icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		   int *verdict, struct ip_vs_conn **cpp)
+{
+	*verdict = NF_ACCEPT;
+	return 0;
+}
+
+static int
+icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+	if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
+		if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
+			if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
+				IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
+				return 0;
+			}
+		}
+	}
+	return 1;
+}
+
+static void
+icmp_debug_packet(struct ip_vs_protocol *pp,
+		  const struct sk_buff *skb,
+		  int offset,
+		  const char *msg)
+{
+	char buf[256];
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph);
+	if (ih == NULL)
+		sprintf(buf, "%s TRUNCATED", pp->name);
+	else if (ih->frag_off & __constant_htons(IP_OFFSET))
+		sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
+			pp->name, NIPQUAD(ih->saddr),
+			NIPQUAD(ih->daddr));
+	else {
+		struct icmphdr _icmph, *ic;
+
+		ic = skb_header_pointer(skb, offset + ih->ihl*4,
+					sizeof(_icmph), &_icmph);
+		if (ic == NULL)
+			sprintf(buf, "%s TRUNCATED to %u bytes\n",
+				pp->name, skb->len - offset);
+		else
+			sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
+				pp->name, NIPQUAD(ih->saddr),
+				NIPQUAD(ih->daddr),
+				ic->type, ic->code);
+	}
+	printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
+}
+
+static int
+icmp_state_transition(struct ip_vs_conn *cp, int direction,
+		      const struct sk_buff *skb,
+		      struct ip_vs_protocol *pp)
+{
+	cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
+	return 1;
+}
+
+static int
+icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+	int num;
+	char **names;
+
+	num = IP_VS_ICMP_S_LAST;
+	names = icmp_state_name_table;
+	return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to);
+}
+
+
+static void icmp_init(struct ip_vs_protocol *pp)
+{
+	pp->timeout_table = icmp_timeouts;
+}
+
+static void icmp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+struct ip_vs_protocol ip_vs_protocol_icmp = {
+	.name =			"ICMP",
+	.protocol =		IPPROTO_ICMP,
+	.dont_defrag =		0,
+	.init =			icmp_init,
+	.exit =			icmp_exit,
+	.conn_schedule =	icmp_conn_schedule,
+	.conn_in_get =		icmp_conn_in_get,
+	.conn_out_get =		icmp_conn_out_get,
+	.snat_handler =		NULL,
+	.dnat_handler =		NULL,
+	.csum_check =		icmp_csum_check,
+	.state_transition =	icmp_state_transition,
+	.register_app =		NULL,
+	.unregister_app =	NULL,
+	.app_conn_bind =	NULL,
+	.debug_packet =		icmp_debug_packet,
+	.timeout_change =	NULL,
+	.set_state_timeout =	icmp_set_state_timeout,
+};
diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c
new file mode 100644
index 000000000000..e65de675da74
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c
@@ -0,0 +1,640 @@
+/*
+ * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
+ *
+ * Version:     $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <net/ip.h>
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+static struct ip_vs_conn *
+tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+	__u16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	if (likely(!inverse)) {
+		return ip_vs_conn_in_get(iph->protocol,
+					 iph->saddr, pptr[0],
+					 iph->daddr, pptr[1]);
+	} else {
+		return ip_vs_conn_in_get(iph->protocol,
+					 iph->daddr, pptr[1],
+					 iph->saddr, pptr[0]);
+	}
+}
+
+static struct ip_vs_conn *
+tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		 const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+	__u16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	if (likely(!inverse)) {
+		return ip_vs_conn_out_get(iph->protocol,
+					  iph->saddr, pptr[0],
+					  iph->daddr, pptr[1]);
+	} else {
+		return ip_vs_conn_out_get(iph->protocol,
+					  iph->daddr, pptr[1],
+					  iph->saddr, pptr[0]);
+	}
+}
+
+
+static int
+tcp_conn_schedule(struct sk_buff *skb,
+		  struct ip_vs_protocol *pp,
+		  int *verdict, struct ip_vs_conn **cpp)
+{
+	struct ip_vs_service *svc;
+	struct tcphdr _tcph, *th;
+
+	th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		*verdict = NF_DROP;
+		return 0;
+	}
+
+	if (th->syn &&
+	    (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
+				     skb->nh.iph->daddr, th->dest))) {
+		if (ip_vs_todrop()) {
+			/*
+			 * It seems that we are very loaded.
+			 * We have to drop this packet :(
+			 */
+			ip_vs_service_put(svc);
+			*verdict = NF_DROP;
+			return 0;
+		}
+
+		/*
+		 * Let the virtual server select a real server for the
+		 * incoming connection, and create a connection entry.
+		 */
+		*cpp = ip_vs_schedule(svc, skb);
+		if (!*cpp) {
+			*verdict = ip_vs_leave(svc, skb, pp);
+			return 0;
+		}
+		ip_vs_service_put(svc);
+	}
+	return 1;
+}
+
+
+static inline void
+tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
+		     u16 oldport, u16 newport)
+{
+	tcph->check =
+		ip_vs_check_diff(~oldip, newip,
+				 ip_vs_check_diff(oldport ^ 0xFFFF,
+						  newport, tcph->check));
+}
+
+
+static int
+tcp_snat_handler(struct sk_buff **pskb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct tcphdr *tcph;
+	unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
+
+	/* csum_check requires unshared skb */
+	if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(*pskb, pp))
+			return 0;
+
+		/* Call application helper if needed */
+		if (!ip_vs_app_pkt_out(cp, pskb))
+			return 0;
+	}
+
+	tcph = (void *)(*pskb)->nh.iph + tcphoff;
+	tcph->source = cp->vport;
+
+	/* Adjust TCP checksums */
+	if (!cp->app) {
+		/* Only port and addr are changed, do fast csum update */
+		tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
+				     cp->dport, cp->vport);
+		if ((*pskb)->ip_summed == CHECKSUM_HW)
+			(*pskb)->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		tcph->check = 0;
+		(*pskb)->csum = skb_checksum(*pskb, tcphoff,
+					     (*pskb)->len - tcphoff, 0);
+		tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
+						(*pskb)->len - tcphoff,
+						cp->protocol,
+						(*pskb)->csum);
+		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+			  pp->name, tcph->check,
+			  (char*)&(tcph->check) - (char*)tcph);
+	}
+	return 1;
+}
+
+
+static int
+tcp_dnat_handler(struct sk_buff **pskb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct tcphdr *tcph;
+	unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
+
+	/* csum_check requires unshared skb */
+	if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(*pskb, pp))
+			return 0;
+
+		/*
+		 *	Attempt ip_vs_app call.
+		 *	It will fix ip_vs_conn and iph ack_seq stuff
+		 */
+		if (!ip_vs_app_pkt_in(cp, pskb))
+			return 0;
+	}
+
+	tcph = (void *)(*pskb)->nh.iph + tcphoff;
+	tcph->dest = cp->dport;
+
+	/*
+	 *	Adjust TCP checksums
+	 */
+	if (!cp->app) {
+		/* Only port and addr are changed, do fast csum update */
+		tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
+				     cp->vport, cp->dport);
+		if ((*pskb)->ip_summed == CHECKSUM_HW)
+			(*pskb)->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		tcph->check = 0;
+		(*pskb)->csum = skb_checksum(*pskb, tcphoff,
+					     (*pskb)->len - tcphoff, 0);
+		tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
+						(*pskb)->len - tcphoff,
+						cp->protocol,
+						(*pskb)->csum);
+		(*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return 1;
+}
+
+
+static int
+tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+	unsigned int tcphoff = skb->nh.iph->ihl*4;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_NONE:
+		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
+	case CHECKSUM_HW:
+		if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
+				      skb->len - tcphoff,
+				      skb->nh.iph->protocol, skb->csum)) {
+			IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+					 "Failed checksum for");
+			return 0;
+		}
+		break;
+	default:
+		/* CHECKSUM_UNNECESSARY */
+		break;
+	}
+
+	return 1;
+}
+
+
+#define TCP_DIR_INPUT		0
+#define TCP_DIR_OUTPUT		4
+#define TCP_DIR_INPUT_ONLY	8
+
+static int tcp_state_off[IP_VS_DIR_LAST] = {
+	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
+	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
+	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
+};
+
+/*
+ *	Timeout table[state]
+ */
+static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
+	[IP_VS_TCP_S_NONE]		=	2*HZ,
+	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
+	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
+	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
+	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
+	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
+	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
+	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
+	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
+	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
+	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
+	[IP_VS_TCP_S_LAST]		=	2*HZ,
+};
+
+
+#if 0
+
+/* FIXME: This is going to die */
+
+static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = {
+	[IP_VS_TCP_S_NONE]		=	2*HZ,
+	[IP_VS_TCP_S_ESTABLISHED]	=	8*60*HZ,
+	[IP_VS_TCP_S_SYN_SENT]		=	60*HZ,
+	[IP_VS_TCP_S_SYN_RECV]		=	10*HZ,
+	[IP_VS_TCP_S_FIN_WAIT]		=	60*HZ,
+	[IP_VS_TCP_S_TIME_WAIT]		=	60*HZ,
+	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
+	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
+	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
+	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
+	[IP_VS_TCP_S_SYNACK]		=	100*HZ,
+	[IP_VS_TCP_S_LAST]		=	2*HZ,
+};
+
+#endif
+
+static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
+	[IP_VS_TCP_S_NONE]		=	"NONE",
+	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
+	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
+	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
+	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
+	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
+	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
+	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
+	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
+	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
+	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
+	[IP_VS_TCP_S_LAST]		=	"BUG!",
+};
+
+#define sNO IP_VS_TCP_S_NONE
+#define sES IP_VS_TCP_S_ESTABLISHED
+#define sSS IP_VS_TCP_S_SYN_SENT
+#define sSR IP_VS_TCP_S_SYN_RECV
+#define sFW IP_VS_TCP_S_FIN_WAIT
+#define sTW IP_VS_TCP_S_TIME_WAIT
+#define sCL IP_VS_TCP_S_CLOSE
+#define sCW IP_VS_TCP_S_CLOSE_WAIT
+#define sLA IP_VS_TCP_S_LAST_ACK
+#define sLI IP_VS_TCP_S_LISTEN
+#define sSA IP_VS_TCP_S_SYNACK
+
+struct tcp_states_t {
+	int next_state[IP_VS_TCP_S_LAST];
+};
+
+static const char * tcp_state_name(int state)
+{
+	if (state >= IP_VS_TCP_S_LAST)
+		return "ERR!";
+	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
+}
+
+static struct tcp_states_t tcp_states [] = {
+/*	INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},
+
+/*	OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*	INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t tcp_states_dos [] = {
+/*	INPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
+/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
+/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+
+/*	OUTPUT */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
+/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
+/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
+/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},
+
+/*	INPUT-ONLY */
+/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
+/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
+/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
+/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
+/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
+};
+
+static struct tcp_states_t *tcp_state_table = tcp_states;
+
+
+static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
+{
+	int on = (flags & 1);		/* secure_tcp */
+
+	/*
+	** FIXME: change secure_tcp to independent sysctl var
+	** or make it per-service or per-app because it is valid
+	** for most if not for all of the applications. Something
+	** like "capabilities" (flags) for each object.
+	*/
+	tcp_state_table = (on? tcp_states_dos : tcp_states);
+}
+
+static int
+tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
+				       tcp_state_name_table, sname, to);
+}
+
+static inline int tcp_state_idx(struct tcphdr *th)
+{
+	if (th->rst)
+		return 3;
+	if (th->syn)
+		return 0;
+	if (th->fin)
+		return 1;
+	if (th->ack)
+		return 2;
+	return -1;
+}
+
+static inline void
+set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
+	      int direction, struct tcphdr *th)
+{
+	int state_idx;
+	int new_state = IP_VS_TCP_S_CLOSE;
+	int state_off = tcp_state_off[direction];
+
+	/*
+	 *    Update state offset to INPUT_ONLY if necessary
+	 *    or delete NO_OUTPUT flag if output packet detected
+	 */
+	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
+		if (state_off == TCP_DIR_OUTPUT)
+			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
+		else
+			state_off = TCP_DIR_INPUT_ONLY;
+	}
+
+	if ((state_idx = tcp_state_idx(th)) < 0) {
+		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
+		goto tcp_state_out;
+	}
+
+	new_state = tcp_state_table[state_off+state_idx].next_state[cp->state];
+
+  tcp_state_out:
+	if (new_state != cp->state) {
+		struct ip_vs_dest *dest = cp->dest;
+
+		IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->"
+			  "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n",
+			  pp->name,
+			  (state_off==TCP_DIR_OUTPUT)?"output ":"input ",
+			  th->syn? 'S' : '.',
+			  th->fin? 'F' : '.',
+			  th->ack? 'A' : '.',
+			  th->rst? 'R' : '.',
+			  NIPQUAD(cp->daddr), ntohs(cp->dport),
+			  NIPQUAD(cp->caddr), ntohs(cp->cport),
+			  tcp_state_name(cp->state),
+			  tcp_state_name(new_state),
+			  atomic_read(&cp->refcnt));
+		if (dest) {
+			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
+			    (new_state != IP_VS_TCP_S_ESTABLISHED)) {
+				atomic_dec(&dest->activeconns);
+				atomic_inc(&dest->inactconns);
+				cp->flags |= IP_VS_CONN_F_INACTIVE;
+			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
+				   (new_state == IP_VS_TCP_S_ESTABLISHED)) {
+				atomic_inc(&dest->activeconns);
+				atomic_dec(&dest->inactconns);
+				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
+			}
+		}
+	}
+
+	cp->timeout = pp->timeout_table[cp->state = new_state];
+}
+
+
+/*
+ *	Handle state transitions
+ */
+static int
+tcp_state_transition(struct ip_vs_conn *cp, int direction,
+		     const struct sk_buff *skb,
+		     struct ip_vs_protocol *pp)
+{
+	struct tcphdr _tcph, *th;
+
+	th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return 0;
+
+	spin_lock(&cp->lock);
+	set_tcp_state(pp, cp, direction, th);
+	spin_unlock(&cp->lock);
+
+	return 1;
+}
+
+
+/*
+ *	Hash table for TCP application incarnations
+ */
+#define	TCP_APP_TAB_BITS	4
+#define	TCP_APP_TAB_SIZE	(1 << TCP_APP_TAB_BITS)
+#define	TCP_APP_TAB_MASK	(TCP_APP_TAB_SIZE - 1)
+
+static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
+static DEFINE_SPINLOCK(tcp_app_lock);
+
+static inline __u16 tcp_app_hashkey(__u16 port)
+{
+	return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK;
+}
+
+
+static int tcp_register_app(struct ip_vs_app *inc)
+{
+	struct ip_vs_app *i;
+	__u16 hash, port = inc->port;
+	int ret = 0;
+
+	hash = tcp_app_hashkey(port);
+
+	spin_lock_bh(&tcp_app_lock);
+	list_for_each_entry(i, &tcp_apps[hash], p_list) {
+		if (i->port == port) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+	list_add(&inc->p_list, &tcp_apps[hash]);
+	atomic_inc(&ip_vs_protocol_tcp.appcnt);
+
+  out:
+	spin_unlock_bh(&tcp_app_lock);
+	return ret;
+}
+
+
+static void
+tcp_unregister_app(struct ip_vs_app *inc)
+{
+	spin_lock_bh(&tcp_app_lock);
+	atomic_dec(&ip_vs_protocol_tcp.appcnt);
+	list_del(&inc->p_list);
+	spin_unlock_bh(&tcp_app_lock);
+}
+
+
+static int
+tcp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	int hash;
+	struct ip_vs_app *inc;
+	int result = 0;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = tcp_app_hashkey(cp->vport);
+
+	spin_lock(&tcp_app_lock);
+	list_for_each_entry(inc, &tcp_apps[hash], p_list) {
+		if (inc->port == cp->vport) {
+			if (unlikely(!ip_vs_app_inc_get(inc)))
+				break;
+			spin_unlock(&tcp_app_lock);
+
+			IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
+				  "%u.%u.%u.%u:%u to app %s on port %u\n",
+				  __FUNCTION__,
+				  NIPQUAD(cp->caddr), ntohs(cp->cport),
+				  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+				  inc->name, ntohs(inc->port));
+			cp->app = inc;
+			if (inc->init_conn)
+				result = inc->init_conn(inc, cp);
+			goto out;
+		}
+	}
+	spin_unlock(&tcp_app_lock);
+
+  out:
+	return result;
+}
+
+
+/*
+ *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
+ */
+void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
+{
+	spin_lock(&cp->lock);
+	cp->state = IP_VS_TCP_S_LISTEN;
+	cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN];
+	spin_unlock(&cp->lock);
+}
+
+
+static void tcp_init(struct ip_vs_protocol *pp)
+{
+	IP_VS_INIT_HASH_TABLE(tcp_apps);
+	pp->timeout_table = tcp_timeouts;
+}
+
+
+static void tcp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_tcp = {
+	.name =			"TCP",
+	.protocol =		IPPROTO_TCP,
+	.dont_defrag =		0,
+	.appcnt =		ATOMIC_INIT(0),
+	.init =			tcp_init,
+	.exit =			tcp_exit,
+	.register_app =		tcp_register_app,
+	.unregister_app =	tcp_unregister_app,
+	.conn_schedule =	tcp_conn_schedule,
+	.conn_in_get =		tcp_conn_in_get,
+	.conn_out_get =		tcp_conn_out_get,
+	.snat_handler =		tcp_snat_handler,
+	.dnat_handler =		tcp_dnat_handler,
+	.csum_check =		tcp_csum_check,
+	.state_name =		tcp_state_name,
+	.state_transition =	tcp_state_transition,
+	.app_conn_bind =	tcp_app_conn_bind,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+	.timeout_change =	tcp_timeout_change,
+	.set_state_timeout =	tcp_set_state_timeout,
+};
diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c
new file mode 100644
index 000000000000..8ae5f2e0aefa
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_proto_udp.c
@@ -0,0 +1,427 @@
+/*
+ * ip_vs_proto_udp.c:	UDP load balancing support for IPVS
+ *
+ * Version:     $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+static struct ip_vs_conn *
+udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn *cp;
+	__u16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_in_get(iph->protocol,
+				       iph->saddr, pptr[0],
+				       iph->daddr, pptr[1]);
+	} else {
+		cp = ip_vs_conn_in_get(iph->protocol,
+				       iph->daddr, pptr[1],
+				       iph->saddr, pptr[0]);
+	}
+
+	return cp;
+}
+
+
+static struct ip_vs_conn *
+udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
+		 const struct iphdr *iph, unsigned int proto_off, int inverse)
+{
+	struct ip_vs_conn *cp;
+	__u16 _ports[2], *pptr;
+
+	pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+				  sizeof(_ports), _ports);
+	if (pptr == NULL)
+		return NULL;
+
+	if (likely(!inverse)) {
+		cp = ip_vs_conn_out_get(iph->protocol,
+					iph->saddr, pptr[0],
+					iph->daddr, pptr[1]);
+	} else {
+		cp = ip_vs_conn_out_get(iph->protocol,
+					iph->daddr, pptr[1],
+					iph->saddr, pptr[0]);
+	}
+
+	return cp;
+}
+
+
+static int
+udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
+		  int *verdict, struct ip_vs_conn **cpp)
+{
+	struct ip_vs_service *svc;
+	struct udphdr _udph, *uh;
+
+	uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+				sizeof(_udph), &_udph);
+	if (uh == NULL) {
+		*verdict = NF_DROP;
+		return 0;
+	}
+
+	if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
+				     skb->nh.iph->daddr, uh->dest))) {
+		if (ip_vs_todrop()) {
+			/*
+			 * It seems that we are very loaded.
+			 * We have to drop this packet :(
+			 */
+			ip_vs_service_put(svc);
+			*verdict = NF_DROP;
+			return 0;
+		}
+
+		/*
+		 * Let the virtual server select a real server for the
+		 * incoming connection, and create a connection entry.
+		 */
+		*cpp = ip_vs_schedule(svc, skb);
+		if (!*cpp) {
+			*verdict = ip_vs_leave(svc, skb, pp);
+			return 0;
+		}
+		ip_vs_service_put(svc);
+	}
+	return 1;
+}
+
+
+static inline void
+udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
+		     u16 oldport, u16 newport)
+{
+	uhdr->check =
+		ip_vs_check_diff(~oldip, newip,
+				 ip_vs_check_diff(oldport ^ 0xFFFF,
+						  newport, uhdr->check));
+	if (!uhdr->check)
+		uhdr->check = 0xFFFF;
+}
+
+static int
+udp_snat_handler(struct sk_buff **pskb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct udphdr *udph;
+	unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
+
+	/* csum_check requires unshared skb */
+	if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(*pskb, pp))
+			return 0;
+
+		/*
+		 *	Call application helper if needed
+		 */
+		if (!ip_vs_app_pkt_out(cp, pskb))
+			return 0;
+	}
+
+	udph = (void *)(*pskb)->nh.iph + udphoff;
+	udph->source = cp->vport;
+
+	/*
+	 *	Adjust UDP checksums
+	 */
+	if (!cp->app && (udph->check != 0)) {
+		/* Only port and addr are changed, do fast csum update */
+		udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
+				     cp->dport, cp->vport);
+		if ((*pskb)->ip_summed == CHECKSUM_HW)
+			(*pskb)->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		udph->check = 0;
+		(*pskb)->csum = skb_checksum(*pskb, udphoff,
+					     (*pskb)->len - udphoff, 0);
+		udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
+						(*pskb)->len - udphoff,
+						cp->protocol,
+						(*pskb)->csum);
+		if (udph->check == 0)
+			udph->check = 0xFFFF;
+		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
+			  pp->name, udph->check,
+			  (char*)&(udph->check) - (char*)udph);
+	}
+	return 1;
+}
+
+
+static int
+udp_dnat_handler(struct sk_buff **pskb,
+		 struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
+{
+	struct udphdr *udph;
+	unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
+
+	/* csum_check requires unshared skb */
+	if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
+		return 0;
+
+	if (unlikely(cp->app != NULL)) {
+		/* Some checks before mangling */
+		if (pp->csum_check && !pp->csum_check(*pskb, pp))
+			return 0;
+
+		/*
+		 *	Attempt ip_vs_app call.
+		 *	It will fix ip_vs_conn
+		 */
+		if (!ip_vs_app_pkt_in(cp, pskb))
+			return 0;
+	}
+
+	udph = (void *)(*pskb)->nh.iph + udphoff;
+	udph->dest = cp->dport;
+
+	/*
+	 *	Adjust UDP checksums
+	 */
+	if (!cp->app && (udph->check != 0)) {
+		/* Only port and addr are changed, do fast csum update */
+		udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
+				     cp->vport, cp->dport);
+		if ((*pskb)->ip_summed == CHECKSUM_HW)
+			(*pskb)->ip_summed = CHECKSUM_NONE;
+	} else {
+		/* full checksum calculation */
+		udph->check = 0;
+		(*pskb)->csum = skb_checksum(*pskb, udphoff,
+					     (*pskb)->len - udphoff, 0);
+		udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
+						(*pskb)->len - udphoff,
+						cp->protocol,
+						(*pskb)->csum);
+		if (udph->check == 0)
+			udph->check = 0xFFFF;
+		(*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+	return 1;
+}
+
+
+static int
+udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
+{
+	struct udphdr _udph, *uh;
+	unsigned int udphoff = skb->nh.iph->ihl*4;
+
+	uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph);
+	if (uh == NULL)
+		return 0;
+
+	if (uh->check != 0) {
+		switch (skb->ip_summed) {
+		case CHECKSUM_NONE:
+			skb->csum = skb_checksum(skb, udphoff,
+						 skb->len - udphoff, 0);
+		case CHECKSUM_HW:
+			if (csum_tcpudp_magic(skb->nh.iph->saddr,
+					      skb->nh.iph->daddr,
+					      skb->len - udphoff,
+					      skb->nh.iph->protocol,
+					      skb->csum)) {
+				IP_VS_DBG_RL_PKT(0, pp, skb, 0,
+						 "Failed checksum for");
+				return 0;
+			}
+			break;
+		default:
+			/* CHECKSUM_UNNECESSARY */
+			break;
+		}
+	}
+	return 1;
+}
+
+
+/*
+ *	Note: the caller guarantees that only one of register_app,
+ *	unregister_app or app_conn_bind is called each time.
+ */
+
+#define	UDP_APP_TAB_BITS	4
+#define	UDP_APP_TAB_SIZE	(1 << UDP_APP_TAB_BITS)
+#define	UDP_APP_TAB_MASK	(UDP_APP_TAB_SIZE - 1)
+
+static struct list_head udp_apps[UDP_APP_TAB_SIZE];
+static DEFINE_SPINLOCK(udp_app_lock);
+
+static inline __u16 udp_app_hashkey(__u16 port)
+{
+	return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK;
+}
+
+
+static int udp_register_app(struct ip_vs_app *inc)
+{
+	struct ip_vs_app *i;
+	__u16 hash, port = inc->port;
+	int ret = 0;
+
+	hash = udp_app_hashkey(port);
+
+
+	spin_lock_bh(&udp_app_lock);
+	list_for_each_entry(i, &udp_apps[hash], p_list) {
+		if (i->port == port) {
+			ret = -EEXIST;
+			goto out;
+		}
+	}
+	list_add(&inc->p_list, &udp_apps[hash]);
+	atomic_inc(&ip_vs_protocol_udp.appcnt);
+
+  out:
+	spin_unlock_bh(&udp_app_lock);
+	return ret;
+}
+
+
+static void
+udp_unregister_app(struct ip_vs_app *inc)
+{
+	spin_lock_bh(&udp_app_lock);
+	atomic_dec(&ip_vs_protocol_udp.appcnt);
+	list_del(&inc->p_list);
+	spin_unlock_bh(&udp_app_lock);
+}
+
+
+static int udp_app_conn_bind(struct ip_vs_conn *cp)
+{
+	int hash;
+	struct ip_vs_app *inc;
+	int result = 0;
+
+	/* Default binding: bind app only for NAT */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+		return 0;
+
+	/* Lookup application incarnations and bind the right one */
+	hash = udp_app_hashkey(cp->vport);
+
+	spin_lock(&udp_app_lock);
+	list_for_each_entry(inc, &udp_apps[hash], p_list) {
+		if (inc->port == cp->vport) {
+			if (unlikely(!ip_vs_app_inc_get(inc)))
+				break;
+			spin_unlock(&udp_app_lock);
+
+			IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->"
+				  "%u.%u.%u.%u:%u to app %s on port %u\n",
+				  __FUNCTION__,
+				  NIPQUAD(cp->caddr), ntohs(cp->cport),
+				  NIPQUAD(cp->vaddr), ntohs(cp->vport),
+				  inc->name, ntohs(inc->port));
+			cp->app = inc;
+			if (inc->init_conn)
+				result = inc->init_conn(inc, cp);
+			goto out;
+		}
+	}
+	spin_unlock(&udp_app_lock);
+
+  out:
+	return result;
+}
+
+
+static int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
+	[IP_VS_UDP_S_NORMAL]		=	5*60*HZ,
+	[IP_VS_UDP_S_LAST]		=	2*HZ,
+};
+
+static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
+	[IP_VS_UDP_S_NORMAL]		=	"UDP",
+	[IP_VS_UDP_S_LAST]		=	"BUG!",
+};
+
+
+static int
+udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
+{
+	return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
+				       udp_state_name_table, sname, to);
+}
+
+static const char * udp_state_name(int state)
+{
+	if (state >= IP_VS_UDP_S_LAST)
+		return "ERR!";
+	return udp_state_name_table[state] ? udp_state_name_table[state] : "?";
+}
+
+static int
+udp_state_transition(struct ip_vs_conn *cp, int direction,
+		     const struct sk_buff *skb,
+		     struct ip_vs_protocol *pp)
+{
+	cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
+	return 1;
+}
+
+static void udp_init(struct ip_vs_protocol *pp)
+{
+	IP_VS_INIT_HASH_TABLE(udp_apps);
+	pp->timeout_table = udp_timeouts;
+}
+
+static void udp_exit(struct ip_vs_protocol *pp)
+{
+}
+
+
+struct ip_vs_protocol ip_vs_protocol_udp = {
+	.name =			"UDP",
+	.protocol =		IPPROTO_UDP,
+	.dont_defrag =		0,
+	.init =			udp_init,
+	.exit =			udp_exit,
+	.conn_schedule =	udp_conn_schedule,
+	.conn_in_get =		udp_conn_in_get,
+	.conn_out_get =		udp_conn_out_get,
+	.snat_handler =		udp_snat_handler,
+	.dnat_handler =		udp_dnat_handler,
+	.csum_check =		udp_csum_check,
+	.state_transition =	udp_state_transition,
+	.state_name =		udp_state_name,
+	.register_app =		udp_register_app,
+	.unregister_app =	udp_unregister_app,
+	.app_conn_bind =	udp_app_conn_bind,
+	.debug_packet =		ip_vs_tcpudp_debug_packet,
+	.timeout_change =	NULL,
+	.set_state_timeout =	udp_set_state_timeout,
+};
diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c
new file mode 100644
index 000000000000..b23bab231cab
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_rr.c
@@ -0,0 +1,118 @@
+/*
+ * IPVS:        Round-Robin Scheduling module
+ *
+ * Version:     $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Fixes/Changes:
+ *     Wensong Zhang            :     changed the ip_vs_rr_schedule to return dest
+ *     Julian Anastasov         :     fixed the NULL pointer access bug in debugging
+ *     Wensong Zhang            :     changed some comestics things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_rr_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int ip_vs_rr_init_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;
+	return 0;
+}
+
+
+static int ip_vs_rr_done_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
+{
+	svc->sched_data = &svc->destinations;
+	return 0;
+}
+
+
+/*
+ * Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct list_head *p, *q;
+	struct ip_vs_dest *dest;
+
+	IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n");
+
+	write_lock(&svc->sched_lock);
+	p = (struct list_head *)svc->sched_data;
+	p = p->next;
+	q = p;
+	do {
+		/* skip list head */
+		if (q == &svc->destinations) {
+			q = q->next;
+			continue;
+		}
+		
+		dest = list_entry(q, struct ip_vs_dest, n_list);
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0)
+			/* HIT */
+			goto out;
+		q = q->next;
+	} while (q != p);
+	write_unlock(&svc->sched_lock);
+	return NULL;
+
+  out:
+	svc->sched_data = q;
+	write_unlock(&svc->sched_lock);
+	IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u "
+		  "activeconns %d refcnt %d weight %d\n",
+		  NIPQUAD(dest->addr), ntohs(dest->port),
+		  atomic_read(&dest->activeconns),
+		  atomic_read(&dest->refcnt), atomic_read(&dest->weight));
+
+	return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_rr_scheduler = {
+	.name =			"rr",			/* name */
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_rr_init_svc,
+	.done_service =		ip_vs_rr_done_svc,
+	.update_service =	ip_vs_rr_update_svc,
+	.schedule =		ip_vs_rr_schedule,
+};
+
+static int __init ip_vs_rr_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+static void __exit ip_vs_rr_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_rr_scheduler);
+}
+
+module_init(ip_vs_rr_init);
+module_exit(ip_vs_rr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c
new file mode 100644
index 000000000000..0f7c56a225bd
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sched.c
@@ -0,0 +1,251 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the Netfilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Version:     $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <asm/string.h>
+#include <linux/kmod.h>
+
+#include <net/ip_vs.h>
+
+/*
+ *  IPVS scheduler list
+ */
+static LIST_HEAD(ip_vs_schedulers);
+
+/* lock for service table */
+static DEFINE_RWLOCK(__ip_vs_sched_lock);
+
+
+/*
+ *  Bind a service with a scheduler
+ */
+int ip_vs_bind_scheduler(struct ip_vs_service *svc,
+			 struct ip_vs_scheduler *scheduler)
+{
+	int ret;
+
+	if (svc == NULL) {
+		IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n");
+		return -EINVAL;
+	}
+	if (scheduler == NULL) {
+		IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n");
+		return -EINVAL;
+	}
+
+	svc->scheduler = scheduler;
+
+	if (scheduler->init_service) {
+		ret = scheduler->init_service(svc);
+		if (ret) {
+			IP_VS_ERR("ip_vs_bind_scheduler(): init error\n");
+			return ret;
+		}
+	}
+
+	return 0;
+}
+
+
+/*
+ *  Unbind a service with its scheduler
+ */
+int ip_vs_unbind_scheduler(struct ip_vs_service *svc)
+{
+	struct ip_vs_scheduler *sched;
+
+	if (svc == NULL) {
+		IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n");
+		return -EINVAL;
+	}
+
+	sched = svc->scheduler;
+	if (sched == NULL) {
+		IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n");
+		return -EINVAL;
+	}
+
+	if (sched->done_service) {
+		if (sched->done_service(svc) != 0) {
+			IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n");
+			return -EINVAL;
+		}
+	}
+
+	svc->scheduler = NULL;
+	return 0;
+}
+
+
+/*
+ *  Get scheduler in the scheduler list by name
+ */
+static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name)
+{
+	struct ip_vs_scheduler *sched;
+
+	IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n",
+		  sched_name);
+
+	read_lock_bh(&__ip_vs_sched_lock);
+
+	list_for_each_entry(sched, &ip_vs_schedulers, n_list) {
+		/*
+		 * Test and get the modules atomically
+		 */
+		if (sched->module && !try_module_get(sched->module)) {
+			/*
+			 * This scheduler is just deleted
+			 */
+			continue;
+		}
+		if (strcmp(sched_name, sched->name)==0) {
+			/* HIT */
+			read_unlock_bh(&__ip_vs_sched_lock);
+			return sched;
+		}
+		if (sched->module)
+			module_put(sched->module);
+	}
+
+	read_unlock_bh(&__ip_vs_sched_lock);
+	return NULL;
+}
+
+
+/*
+ *  Lookup scheduler and try to load it if it doesn't exist
+ */
+struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name)
+{
+	struct ip_vs_scheduler *sched;
+
+	/*
+	 *  Search for the scheduler by sched_name
+	 */
+	sched = ip_vs_sched_getbyname(sched_name);
+
+	/*
+	 *  If scheduler not found, load the module and search again
+	 */
+	if (sched == NULL) {
+		request_module("ip_vs_%s", sched_name);
+		sched = ip_vs_sched_getbyname(sched_name);
+	}
+
+	return sched;
+}
+
+void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler)
+{
+	if (scheduler->module)
+		module_put(scheduler->module);
+}
+
+
+/*
+ *  Register a scheduler in the scheduler list
+ */
+int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+	struct ip_vs_scheduler *sched;
+
+	if (!scheduler) {
+		IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n");
+		return -EINVAL;
+	}
+
+	if (!scheduler->name) {
+		IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n");
+		return -EINVAL;
+	}
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	/*
+	 *  Make sure that the scheduler with this name doesn't exist
+	 *  in the scheduler list.
+	 */
+	sched = ip_vs_sched_getbyname(scheduler->name);
+	if (sched) {
+		ip_vs_scheduler_put(sched);
+		ip_vs_use_count_dec();
+		IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+			  "already existed in the system\n", scheduler->name);
+		return -EINVAL;
+	}
+
+	write_lock_bh(&__ip_vs_sched_lock);
+
+	if (scheduler->n_list.next != &scheduler->n_list) {
+		write_unlock_bh(&__ip_vs_sched_lock);
+		ip_vs_use_count_dec();
+		IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler "
+			  "already linked\n", scheduler->name);
+		return -EINVAL;
+	}
+
+	/*
+	 *	Add it into the d-linked scheduler list
+	 */
+	list_add(&scheduler->n_list, &ip_vs_schedulers);
+	write_unlock_bh(&__ip_vs_sched_lock);
+
+	IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name);
+
+	return 0;
+}
+
+
+/*
+ *  Unregister a scheduler from the scheduler list
+ */
+int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler)
+{
+	if (!scheduler) {
+		IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n");
+		return -EINVAL;
+	}
+
+	write_lock_bh(&__ip_vs_sched_lock);
+	if (scheduler->n_list.next == &scheduler->n_list) {
+		write_unlock_bh(&__ip_vs_sched_lock);
+		IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler "
+			  "is not in the list. failed\n", scheduler->name);
+		return -EINVAL;
+	}
+
+	/*
+	 *	Remove it from the d-linked scheduler list
+	 */
+	list_del(&scheduler->n_list);
+	write_unlock_bh(&__ip_vs_sched_lock);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name);
+
+	return 0;
+}
diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c
new file mode 100644
index 000000000000..ff366f7390d9
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sed.c
@@ -0,0 +1,163 @@
+/*
+ * IPVS:        Shortest Expected Delay scheduling module
+ *
+ * Version:     $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The SED algorithm attempts to minimize each job's expected delay until
+ * completion. The expected delay that the job will experience is
+ * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of
+ * jobs on the the ith server and Ui is the fixed service rate (weight) of
+ * the ith server. The SED algorithm adopts a greedy policy that each does
+ * what is in its own best interest, i.e. to join the queue which would
+ * minimize its expected delay of completion.
+ *
+ * See the following paper for more information:
+ * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing
+ * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88,
+ * pages 986-994, 1988.
+ *
+ * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me.
+ *
+ * The difference between SED and WLC is that SED includes the incoming
+ * job in the cost function (the increment of 1). SED may outperform
+ * WLC, while scheduling big jobs under larger heterogeneous systems
+ * (the server weight varies a lot).
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_sed_init_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int
+ip_vs_sed_done_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int
+ip_vs_sed_update_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static inline unsigned int
+ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We only use the active connection number in the cost
+	 * calculation here.
+	 */
+	return atomic_read(&dest->activeconns) + 1;
+}
+
+
+/*
+ *	Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least;
+	unsigned int loh, doh;
+
+	IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n");
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *	(server expected overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_sed_dest_overhead(least);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		doh = ip_vs_sed_dest_overhead(dest);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_sed_scheduler =
+{
+	.name =			"sed",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_sed_init_svc,
+	.done_service =		ip_vs_sed_done_svc,
+	.update_service =	ip_vs_sed_update_svc,
+	.schedule =		ip_vs_sed_schedule,
+};
+
+
+static int __init ip_vs_sed_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+static void __exit ip_vs_sed_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_sed_scheduler);
+}
+
+module_init(ip_vs_sed_init);
+module_exit(ip_vs_sed_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c
new file mode 100644
index 000000000000..6f7c50e44a39
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sh.c
@@ -0,0 +1,255 @@
+/*
+ * IPVS:        Source Hashing scheduling module
+ *
+ * Version:     $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@gnuchina.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+/*
+ * The sh algorithm is to select server by the hash key of source IP
+ * address. The pseudo code is as follows:
+ *
+ *       n <- servernode[src_ip];
+ *       if (n is dead) OR
+ *          (n is overloaded) or (n.weight <= 0) then
+ *                 return NULL;
+ *
+ *       return n;
+ *
+ * Notes that servernode is a 256-bucket hash table that maps the hash
+ * index derived from packet source IP address to the current server
+ * array. If the sh scheduler is used in cache cluster, it is good to
+ * combine it with cache_bypass feature. When the statically assigned
+ * server is dead or overloaded, the load balancer can bypass the cache
+ * server and send requests to the original server directly.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      IPVS SH bucket
+ */
+struct ip_vs_sh_bucket {
+	struct ip_vs_dest       *dest;          /* real server (cache) */
+};
+
+/*
+ *     for IPVS SH entry hash table
+ */
+#ifndef CONFIG_IP_VS_SH_TAB_BITS
+#define CONFIG_IP_VS_SH_TAB_BITS        8
+#endif
+#define IP_VS_SH_TAB_BITS               CONFIG_IP_VS_SH_TAB_BITS
+#define IP_VS_SH_TAB_SIZE               (1 << IP_VS_SH_TAB_BITS)
+#define IP_VS_SH_TAB_MASK               (IP_VS_SH_TAB_SIZE - 1)
+
+
+/*
+ *	Returns hash value for IPVS SH entry
+ */
+static inline unsigned ip_vs_sh_hashkey(__u32 addr)
+{
+	return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK;
+}
+
+
+/*
+ *      Get ip_vs_dest associated with supplied parameters.
+ */
+static inline struct ip_vs_dest *
+ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr)
+{
+	return (tbl[ip_vs_sh_hashkey(addr)]).dest;
+}
+
+
+/*
+ *      Assign all the hash buckets of the specified table with the service.
+ */
+static int
+ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc)
+{
+	int i;
+	struct ip_vs_sh_bucket *b;
+	struct list_head *p;
+	struct ip_vs_dest *dest;
+
+	b = tbl;
+	p = &svc->destinations;
+	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+		if (list_empty(p)) {
+			b->dest = NULL;
+		} else {
+			if (p == &svc->destinations)
+				p = p->next;
+
+			dest = list_entry(p, struct ip_vs_dest, n_list);
+			atomic_inc(&dest->refcnt);
+			b->dest = dest;
+
+			p = p->next;
+		}
+		b++;
+	}
+	return 0;
+}
+
+
+/*
+ *      Flush all the hash buckets of the specified table.
+ */
+static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl)
+{
+	int i;
+	struct ip_vs_sh_bucket *b;
+
+	b = tbl;
+	for (i=0; i<IP_VS_SH_TAB_SIZE; i++) {
+		if (b->dest) {
+			atomic_dec(&b->dest->refcnt);
+			b->dest = NULL;
+		}
+		b++;
+	}
+}
+
+
+static int ip_vs_sh_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_bucket *tbl;
+
+	/* allocate the SH table for this service */
+	tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE,
+		      GFP_ATOMIC);
+	if (tbl == NULL) {
+		IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	svc->sched_data = tbl;
+	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for "
+		  "current service\n",
+		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_sh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+static int ip_vs_sh_done_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_sh_flush(tbl);
+
+	/* release the table itself */
+	kfree(svc->sched_data);
+	IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n",
+		  sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE);
+
+	return 0;
+}
+
+
+static int ip_vs_sh_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_sh_bucket *tbl = svc->sched_data;
+
+	/* got to clean up hash buckets here */
+	ip_vs_sh_flush(tbl);
+
+	/* assign the hash buckets with the updated service */
+	ip_vs_sh_assign(tbl, svc);
+
+	return 0;
+}
+
+
+/*
+ *      If the dest flags is set with IP_VS_DEST_F_OVERLOAD,
+ *      consider that the server is overloaded here.
+ */
+static inline int is_overloaded(struct ip_vs_dest *dest)
+{
+	return dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+
+/*
+ *      Source Hashing scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_sh_bucket *tbl;
+	struct iphdr *iph = skb->nh.iph;
+
+	IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
+
+	tbl = (struct ip_vs_sh_bucket *)svc->sched_data;
+	dest = ip_vs_sh_get(tbl, iph->saddr);
+	if (!dest
+	    || !(dest->flags & IP_VS_DEST_F_AVAILABLE)
+	    || atomic_read(&dest->weight) <= 0
+	    || is_overloaded(dest)) {
+		return NULL;
+	}
+
+	IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u "
+		  "--> server %u.%u.%u.%u:%d\n",
+		  NIPQUAD(iph->saddr),
+		  NIPQUAD(dest->addr),
+		  ntohs(dest->port));
+
+	return dest;
+}
+
+
+/*
+ *      IPVS SH Scheduler structure
+ */
+static struct ip_vs_scheduler ip_vs_sh_scheduler =
+{
+	.name =			"sh",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_sh_init_svc,
+	.done_service =		ip_vs_sh_done_svc,
+	.update_service =	ip_vs_sh_update_svc,
+	.schedule =		ip_vs_sh_schedule,
+};
+
+
+static int __init ip_vs_sh_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+static void __exit ip_vs_sh_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_sh_scheduler);
+}
+
+
+module_init(ip_vs_sh_init);
+module_exit(ip_vs_sh_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c
new file mode 100644
index 000000000000..25c479550a32
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_sync.c
@@ -0,0 +1,892 @@
+/*
+ * IPVS         An implementation of the IP virtual server support for the
+ *              LINUX operating system.  IPVS is now implemented as a module
+ *              over the NetFilter framework. IPVS can be used to build a
+ *              high-performance and highly available server based on a
+ *              cluster of servers.
+ *
+ * Version:     $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ * ip_vs_sync:  sync connection info from master load balancer to backups
+ *              through multicast
+ *
+ * Changes:
+ *	Alexandre Cassen	:	Added master & backup support at a time.
+ *	Alexandre Cassen	:	Added SyncID support for incoming sync
+ *					messages filtering.
+ *	Justin Ossevoort	:	Fix endian problem on sync message size.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/net.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/igmp.h>                 /* for ip_mc_join_group */
+
+#include <net/ip.h>
+#include <net/sock.h>
+#include <asm/uaccess.h>                /* for get_fs and set_fs */
+
+#include <net/ip_vs.h>
+
+#define IP_VS_SYNC_GROUP 0xe0000051    /* multicast addr - 224.0.0.81 */
+#define IP_VS_SYNC_PORT  8848          /* multicast port */
+
+
+/*
+ *	IPVS sync connection entry
+ */
+struct ip_vs_sync_conn {
+	__u8			reserved;
+
+	/* Protocol, addresses and port numbers */
+	__u8			protocol;       /* Which protocol (TCP/UDP) */
+	__u16			cport;
+	__u16                   vport;
+	__u16                   dport;
+	__u32                   caddr;          /* client address */
+	__u32                   vaddr;          /* virtual address */
+	__u32                   daddr;          /* destination address */
+
+	/* Flags and state transition */
+	__u16                   flags;          /* status flags */
+	__u16                   state;          /* state info */
+
+	/* The sequence options start here */
+};
+
+struct ip_vs_sync_conn_options {
+	struct ip_vs_seq        in_seq;         /* incoming seq. struct */
+	struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
+};
+
+#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ)
+#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
+#define FULL_CONN_SIZE  \
+(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))
+
+
+/*
+  The master mulitcasts messages to the backup load balancers in the
+  following format.
+
+       0                   1                   2                   3
+       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |  Count Conns  |    SyncID     |            Size               |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                                                               |
+      |                    IPVS Sync Connection (1)                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                            .                                  |
+      |                            .                                  |
+      |                            .                                  |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+      |                                                               |
+      |                    IPVS Sync Connection (n)                   |
+      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+*/
+
+#define SYNC_MESG_HEADER_LEN	4
+
+struct ip_vs_sync_mesg {
+	__u8                    nr_conns;
+	__u8                    syncid;
+	__u16                   size;
+
+	/* ip_vs_sync_conn entries start here */
+};
+
+/* the maximum length of sync (sending/receiving) message */
+static int sync_send_mesg_maxlen;
+static int sync_recv_mesg_maxlen;
+
+struct ip_vs_sync_buff {
+	struct list_head        list;
+	unsigned long           firstuse;
+
+	/* pointers for the message data */
+	struct ip_vs_sync_mesg  *mesg;
+	unsigned char           *head;
+	unsigned char           *end;
+};
+
+
+/* the sync_buff list head and the lock */
+static LIST_HEAD(ip_vs_sync_queue);
+static DEFINE_SPINLOCK(ip_vs_sync_lock);
+
+/* current sync_buff for accepting new conn entries */
+static struct ip_vs_sync_buff   *curr_sb = NULL;
+static DEFINE_SPINLOCK(curr_sb_lock);
+
+/* ipvs sync daemon state */
+volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
+volatile int ip_vs_master_syncid = 0;
+volatile int ip_vs_backup_syncid = 0;
+
+/* multicast interface name */
+char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
+
+/* multicast addr */
+static struct sockaddr_in mcast_addr;
+
+
+static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
+{
+	spin_lock(&ip_vs_sync_lock);
+	list_add_tail(&sb->list, &ip_vs_sync_queue);
+	spin_unlock(&ip_vs_sync_lock);
+}
+
+static inline struct ip_vs_sync_buff * sb_dequeue(void)
+{
+	struct ip_vs_sync_buff *sb;
+
+	spin_lock_bh(&ip_vs_sync_lock);
+	if (list_empty(&ip_vs_sync_queue)) {
+		sb = NULL;
+	} else {
+		sb = list_entry(ip_vs_sync_queue.next,
+				struct ip_vs_sync_buff,
+				list);
+		list_del(&sb->list);
+	}
+	spin_unlock_bh(&ip_vs_sync_lock);
+
+	return sb;
+}
+
+static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void)
+{
+	struct ip_vs_sync_buff *sb;
+
+	if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
+		return NULL;
+
+	if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
+		kfree(sb);
+		return NULL;
+	}
+	sb->mesg->nr_conns = 0;
+	sb->mesg->syncid = ip_vs_master_syncid;
+	sb->mesg->size = 4;
+	sb->head = (unsigned char *)sb->mesg + 4;
+	sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
+	sb->firstuse = jiffies;
+	return sb;
+}
+
+static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
+{
+	kfree(sb->mesg);
+	kfree(sb);
+}
+
+/*
+ *	Get the current sync buffer if it has been created for more
+ *	than the specified time or the specified time is zero.
+ */
+static inline struct ip_vs_sync_buff *
+get_curr_sync_buff(unsigned long time)
+{
+	struct ip_vs_sync_buff *sb;
+
+	spin_lock_bh(&curr_sb_lock);
+	if (curr_sb && (time == 0 ||
+			time_before(jiffies - curr_sb->firstuse, time))) {
+		sb = curr_sb;
+		curr_sb = NULL;
+	} else
+		sb = NULL;
+	spin_unlock_bh(&curr_sb_lock);
+	return sb;
+}
+
+
+/*
+ *      Add an ip_vs_conn information into the current sync_buff.
+ *      Called by ip_vs_in.
+ */
+void ip_vs_sync_conn(struct ip_vs_conn *cp)
+{
+	struct ip_vs_sync_mesg *m;
+	struct ip_vs_sync_conn *s;
+	int len;
+
+	spin_lock(&curr_sb_lock);
+	if (!curr_sb) {
+		if (!(curr_sb=ip_vs_sync_buff_create())) {
+			spin_unlock(&curr_sb_lock);
+			IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
+			return;
+		}
+	}
+
+	len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
+		SIMPLE_CONN_SIZE;
+	m = curr_sb->mesg;
+	s = (struct ip_vs_sync_conn *)curr_sb->head;
+
+	/* copy members */
+	s->protocol = cp->protocol;
+	s->cport = cp->cport;
+	s->vport = cp->vport;
+	s->dport = cp->dport;
+	s->caddr = cp->caddr;
+	s->vaddr = cp->vaddr;
+	s->daddr = cp->daddr;
+	s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
+	s->state = htons(cp->state);
+	if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
+		struct ip_vs_sync_conn_options *opt =
+			(struct ip_vs_sync_conn_options *)&s[1];
+		memcpy(opt, &cp->in_seq, sizeof(*opt));
+	}
+
+	m->nr_conns++;
+	m->size += len;
+	curr_sb->head += len;
+
+	/* check if there is a space for next one */
+	if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) {
+		sb_queue_tail(curr_sb);
+		curr_sb = NULL;
+	}
+	spin_unlock(&curr_sb_lock);
+
+	/* synchronize its controller if it has */
+	if (cp->control)
+		ip_vs_sync_conn(cp->control);
+}
+
+
+/*
+ *      Process received multicast message and create the corresponding
+ *      ip_vs_conn entries.
+ */
+static void ip_vs_process_message(const char *buffer, const size_t buflen)
+{
+	struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
+	struct ip_vs_sync_conn *s;
+	struct ip_vs_sync_conn_options *opt;
+	struct ip_vs_conn *cp;
+	char *p;
+	int i;
+
+	/* Convert size back to host byte order */
+	m->size = ntohs(m->size);
+
+	if (buflen != m->size) {
+		IP_VS_ERR("bogus message\n");
+		return;
+	}
+
+	/* SyncID sanity check */
+	if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
+		IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n",
+			  m->syncid);
+		return;
+	}
+
+	p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);
+	for (i=0; i<m->nr_conns; i++) {
+		s = (struct ip_vs_sync_conn *)p;
+		cp = ip_vs_conn_in_get(s->protocol,
+				       s->caddr, s->cport,
+				       s->vaddr, s->vport);
+		if (!cp) {
+			cp = ip_vs_conn_new(s->protocol,
+					    s->caddr, s->cport,
+					    s->vaddr, s->vport,
+					    s->daddr, s->dport,
+					    ntohs(s->flags), NULL);
+			if (!cp) {
+				IP_VS_ERR("ip_vs_conn_new failed\n");
+				return;
+			}
+			cp->state = ntohs(s->state);
+		} else if (!cp->dest) {
+			/* it is an entry created by the synchronization */
+			cp->state = ntohs(s->state);
+			cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED;
+		}	/* Note that we don't touch its state and flags
+			   if it is a normal entry. */
+
+		if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) {
+			opt = (struct ip_vs_sync_conn_options *)&s[1];
+			memcpy(&cp->in_seq, opt, sizeof(*opt));
+			p += FULL_CONN_SIZE;
+		} else
+			p += SIMPLE_CONN_SIZE;
+
+		atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
+		cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
+		ip_vs_conn_put(cp);
+
+		if (p > buffer+buflen) {
+			IP_VS_ERR("bogus message\n");
+			return;
+		}
+	}
+}
+
+
+/*
+ *      Setup loopback of outgoing multicasts on a sending socket
+ */
+static void set_mcast_loop(struct sock *sk, u_char loop)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */
+	lock_sock(sk);
+	inet->mc_loop = loop ? 1 : 0;
+	release_sock(sk);
+}
+
+/*
+ *      Specify TTL for outgoing multicasts on a sending socket
+ */
+static void set_mcast_ttl(struct sock *sk, u_char ttl)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	/* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */
+	lock_sock(sk);
+	inet->mc_ttl = ttl;
+	release_sock(sk);
+}
+
+/*
+ *      Specifiy default interface for outgoing multicasts
+ */
+static int set_mcast_if(struct sock *sk, char *ifname)
+{
+	struct net_device *dev;
+	struct inet_sock *inet = inet_sk(sk);
+
+	if ((dev = __dev_get_by_name(ifname)) == NULL)
+		return -ENODEV;
+
+	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+		return -EINVAL;
+
+	lock_sock(sk);
+	inet->mc_index = dev->ifindex;
+	/*  inet->mc_addr  = 0; */
+	release_sock(sk);
+
+	return 0;
+}
+
+
+/*
+ *	Set the maximum length of sync message according to the
+ *	specified interface's MTU.
+ */
+static int set_sync_mesg_maxlen(int sync_state)
+{
+	struct net_device *dev;
+	int num;
+
+	if (sync_state == IP_VS_STATE_MASTER) {
+		if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL)
+			return -ENODEV;
+
+		num = (dev->mtu - sizeof(struct iphdr) -
+		       sizeof(struct udphdr) -
+		       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
+		sync_send_mesg_maxlen =
+			SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num;
+		IP_VS_DBG(7, "setting the maximum length of sync sending "
+			  "message %d.\n", sync_send_mesg_maxlen);
+	} else if (sync_state == IP_VS_STATE_BACKUP) {
+		if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL)
+			return -ENODEV;
+
+		sync_recv_mesg_maxlen = dev->mtu -
+			sizeof(struct iphdr) - sizeof(struct udphdr);
+		IP_VS_DBG(7, "setting the maximum length of sync receiving "
+			  "message %d.\n", sync_recv_mesg_maxlen);
+	}
+
+	return 0;
+}
+
+
+/*
+ *      Join a multicast group.
+ *      the group is specified by a class D multicast address 224.0.0.0/8
+ *      in the in_addr structure passed in as a parameter.
+ */
+static int
+join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
+{
+	struct ip_mreqn mreq;
+	struct net_device *dev;
+	int ret;
+
+	memset(&mreq, 0, sizeof(mreq));
+	memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
+
+	if ((dev = __dev_get_by_name(ifname)) == NULL)
+		return -ENODEV;
+	if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
+		return -EINVAL;
+
+	mreq.imr_ifindex = dev->ifindex;
+
+	lock_sock(sk);
+	ret = ip_mc_join_group(sk, &mreq);
+	release_sock(sk);
+
+	return ret;
+}
+
+
+static int bind_mcastif_addr(struct socket *sock, char *ifname)
+{
+	struct net_device *dev;
+	u32 addr;
+	struct sockaddr_in sin;
+
+	if ((dev = __dev_get_by_name(ifname)) == NULL)
+		return -ENODEV;
+
+	addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	if (!addr)
+		IP_VS_ERR("You probably need to specify IP address on "
+			  "multicast interface.\n");
+
+	IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n",
+		  ifname, NIPQUAD(addr));
+
+	/* Now bind the socket with the address of multicast interface */
+	sin.sin_family	     = AF_INET;
+	sin.sin_addr.s_addr  = addr;
+	sin.sin_port         = 0;
+
+	return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin));
+}
+
+/*
+ *      Set up sending multicast socket over UDP
+ */
+static struct socket * make_send_sock(void)
+{
+	struct socket *sock;
+
+	/* First create a socket */
+	if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
+		IP_VS_ERR("Error during creation of socket; terminating\n");
+		return NULL;
+	}
+
+	if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) {
+		IP_VS_ERR("Error setting outbound mcast interface\n");
+		goto error;
+	}
+
+	set_mcast_loop(sock->sk, 0);
+	set_mcast_ttl(sock->sk, 1);
+
+	if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) {
+		IP_VS_ERR("Error binding address of the mcast interface\n");
+		goto error;
+	}
+
+	if (sock->ops->connect(sock,
+			       (struct sockaddr*)&mcast_addr,
+			       sizeof(struct sockaddr), 0) < 0) {
+		IP_VS_ERR("Error connecting to the multicast addr\n");
+		goto error;
+	}
+
+	return sock;
+
+  error:
+	sock_release(sock);
+	return NULL;
+}
+
+
+/*
+ *      Set up receiving multicast socket over UDP
+ */
+static struct socket * make_receive_sock(void)
+{
+	struct socket *sock;
+
+	/* First create a socket */
+	if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) {
+		IP_VS_ERR("Error during creation of socket; terminating\n");
+		return NULL;
+	}
+
+	/* it is equivalent to the REUSEADDR option in user-space */
+	sock->sk->sk_reuse = 1;
+
+	if (sock->ops->bind(sock,
+			    (struct sockaddr*)&mcast_addr,
+			    sizeof(struct sockaddr)) < 0) {
+		IP_VS_ERR("Error binding to the multicast addr\n");
+		goto error;
+	}
+
+	/* join the multicast group */
+	if (join_mcast_group(sock->sk,
+			     (struct in_addr*)&mcast_addr.sin_addr,
+			     ip_vs_backup_mcast_ifn) < 0) {
+		IP_VS_ERR("Error joining to the multicast group\n");
+		goto error;
+	}
+
+	return sock;
+
+  error:
+	sock_release(sock);
+	return NULL;
+}
+
+
+static int
+ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
+{
+	struct msghdr	msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL};
+	struct kvec	iov;
+	int		len;
+
+	EnterFunction(7);
+	iov.iov_base     = (void *)buffer;
+	iov.iov_len      = length;
+
+	len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length));
+
+	LeaveFunction(7);
+	return len;
+}
+
+static void
+ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
+{
+	int msize;
+
+	msize = msg->size;
+
+	/* Put size in network byte order */
+	msg->size = htons(msg->size);
+
+	if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
+		IP_VS_ERR("ip_vs_send_async error\n");
+}
+
+static int
+ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
+{
+	struct msghdr		msg = {NULL,};
+	struct kvec		iov;
+	int			len;
+
+	EnterFunction(7);
+
+	/* Receive a packet */
+	iov.iov_base     = buffer;
+	iov.iov_len      = (size_t)buflen;
+
+	len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+
+	if (len < 0)
+		return -1;
+
+	LeaveFunction(7);
+	return len;
+}
+
+
+static DECLARE_WAIT_QUEUE_HEAD(sync_wait);
+static pid_t sync_master_pid = 0;
+static pid_t sync_backup_pid = 0;
+
+static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait);
+static int stop_master_sync = 0;
+static int stop_backup_sync = 0;
+
+static void sync_master_loop(void)
+{
+	struct socket *sock;
+	struct ip_vs_sync_buff *sb;
+
+	/* create the sending multicast socket */
+	sock = make_send_sock();
+	if (!sock)
+		return;
+
+	IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, "
+		   "syncid = %d\n",
+		   ip_vs_master_mcast_ifn, ip_vs_master_syncid);
+
+	for (;;) {
+		while ((sb=sb_dequeue())) {
+			ip_vs_send_sync_msg(sock, sb->mesg);
+			ip_vs_sync_buff_release(sb);
+		}
+
+		/* check if entries stay in curr_sb for 2 seconds */
+		if ((sb = get_curr_sync_buff(2*HZ))) {
+			ip_vs_send_sync_msg(sock, sb->mesg);
+			ip_vs_sync_buff_release(sb);
+		}
+
+		if (stop_master_sync)
+			break;
+
+		ssleep(1);
+	}
+
+	/* clean up the sync_buff queue */
+	while ((sb=sb_dequeue())) {
+		ip_vs_sync_buff_release(sb);
+	}
+
+	/* clean up the current sync_buff */
+	if ((sb = get_curr_sync_buff(0))) {
+		ip_vs_sync_buff_release(sb);
+	}
+
+	/* release the sending multicast socket */
+	sock_release(sock);
+}
+
+
+static void sync_backup_loop(void)
+{
+	struct socket *sock;
+	char *buf;
+	int len;
+
+	if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
+		IP_VS_ERR("sync_backup_loop: kmalloc error\n");
+		return;
+	}
+
+	/* create the receiving multicast socket */
+	sock = make_receive_sock();
+	if (!sock)
+		goto out;
+
+	IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, "
+		   "syncid = %d\n",
+		   ip_vs_backup_mcast_ifn, ip_vs_backup_syncid);
+
+	for (;;) {
+		/* do you have data now? */
+		while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
+			if ((len =
+			     ip_vs_receive(sock, buf,
+					   sync_recv_mesg_maxlen)) <= 0) {
+				IP_VS_ERR("receiving message error\n");
+				break;
+			}
+			/* disable bottom half, because it accessed the data
+			   shared by softirq while getting/creating conns */
+			local_bh_disable();
+			ip_vs_process_message(buf, len);
+			local_bh_enable();
+		}
+
+		if (stop_backup_sync)
+			break;
+
+		ssleep(1);
+	}
+
+	/* release the sending multicast socket */
+	sock_release(sock);
+
+  out:
+	kfree(buf);
+}
+
+
+static void set_sync_pid(int sync_state, pid_t sync_pid)
+{
+	if (sync_state == IP_VS_STATE_MASTER)
+		sync_master_pid = sync_pid;
+	else if (sync_state == IP_VS_STATE_BACKUP)
+		sync_backup_pid = sync_pid;
+}
+
+static void set_stop_sync(int sync_state, int set)
+{
+	if (sync_state == IP_VS_STATE_MASTER)
+		stop_master_sync = set;
+	else if (sync_state == IP_VS_STATE_BACKUP)
+		stop_backup_sync = set;
+	else {
+		stop_master_sync = set;
+		stop_backup_sync = set;
+	}
+}
+
+static int sync_thread(void *startup)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	mm_segment_t oldmm;
+	int state;
+	const char *name;
+
+	/* increase the module use count */
+	ip_vs_use_count_inc();
+
+	if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) {
+		state = IP_VS_STATE_MASTER;
+		name = "ipvs_syncmaster";
+	} else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) {
+		state = IP_VS_STATE_BACKUP;
+		name = "ipvs_syncbackup";
+	} else {
+		IP_VS_BUG();
+		ip_vs_use_count_dec();
+		return -EINVAL;
+	}
+
+	daemonize(name);
+
+	oldmm = get_fs();
+	set_fs(KERNEL_DS);
+
+	/* Block all signals */
+	spin_lock_irq(&current->sighand->siglock);
+	siginitsetinv(&current->blocked, 0);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	/* set the maximum length of sync message */
+	set_sync_mesg_maxlen(state);
+
+	/* set up multicast address */
+	mcast_addr.sin_family = AF_INET;
+	mcast_addr.sin_port = htons(IP_VS_SYNC_PORT);
+	mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP);
+
+	add_wait_queue(&sync_wait, &wait);
+
+	set_sync_pid(state, current->pid);
+	complete((struct completion *)startup);
+
+	/* processing master/backup loop here */
+	if (state == IP_VS_STATE_MASTER)
+		sync_master_loop();
+	else if (state == IP_VS_STATE_BACKUP)
+		sync_backup_loop();
+	else IP_VS_BUG();
+
+	remove_wait_queue(&sync_wait, &wait);
+
+	/* thread exits */
+	set_sync_pid(state, 0);
+	IP_VS_INFO("sync thread stopped!\n");
+
+	set_fs(oldmm);
+
+	/* decrease the module use count */
+	ip_vs_use_count_dec();
+
+	set_stop_sync(state, 0);
+	wake_up(&stop_sync_wait);
+
+	return 0;
+}
+
+
+static int fork_sync_thread(void *startup)
+{
+	pid_t pid;
+
+	/* fork the sync thread here, then the parent process of the
+	   sync thread is the init process after this thread exits. */
+  repeat:
+	if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) {
+		IP_VS_ERR("could not create sync_thread due to %d... "
+			  "retrying.\n", pid);
+		ssleep(1);
+		goto repeat;
+	}
+
+	return 0;
+}
+
+
+int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
+{
+	DECLARE_COMPLETION(startup);
+	pid_t pid;
+
+	if ((state == IP_VS_STATE_MASTER && sync_master_pid) ||
+	    (state == IP_VS_STATE_BACKUP && sync_backup_pid))
+		return -EEXIST;
+
+	IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
+	IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n",
+		  sizeof(struct ip_vs_sync_conn));
+
+	ip_vs_sync_state |= state;
+	if (state == IP_VS_STATE_MASTER) {
+		strcpy(ip_vs_master_mcast_ifn, mcast_ifn);
+		ip_vs_master_syncid = syncid;
+	} else {
+		strcpy(ip_vs_backup_mcast_ifn, mcast_ifn);
+		ip_vs_backup_syncid = syncid;
+	}
+
+  repeat:
+	if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) {
+		IP_VS_ERR("could not create fork_sync_thread due to %d... "
+			  "retrying.\n", pid);
+		ssleep(1);
+		goto repeat;
+	}
+
+	wait_for_completion(&startup);
+
+	return 0;
+}
+
+
+int stop_sync_thread(int state)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	if ((state == IP_VS_STATE_MASTER && !sync_master_pid) ||
+	    (state == IP_VS_STATE_BACKUP && !sync_backup_pid))
+		return -ESRCH;
+
+	IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid);
+	IP_VS_INFO("stopping sync thread %d ...\n",
+		   (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid);
+
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	add_wait_queue(&stop_sync_wait, &wait);
+	set_stop_sync(state, 1);
+	ip_vs_sync_state -= state;
+	wake_up(&sync_wait);
+	schedule();
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&stop_sync_wait, &wait);
+
+	/* Note: no need to reap the sync thread, because its parent
+	   process is the init process */
+
+	if ((state == IP_VS_STATE_MASTER && stop_master_sync) ||
+	    (state == IP_VS_STATE_BACKUP && stop_backup_sync))
+		IP_VS_BUG();
+
+	return 0;
+}
diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c
new file mode 100644
index 000000000000..8a9d913261d8
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wlc.c
@@ -0,0 +1,151 @@
+/*
+ * IPVS:        Weighted Least-Connection Scheduling module
+ *
+ * Version:     $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Peter Kese <peter.kese@ijs.si>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wlc_schedule to return dest
+ *     Wensong Zhang            :     changed to use the inactconns in scheduling
+ *     Wensong Zhang            :     changed some comestics things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wlc_update_svc
+ *     Wensong Zhang            :     added any dest with weight=0 is quiesced
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+
+static int
+ip_vs_wlc_init_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int
+ip_vs_wlc_done_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static int
+ip_vs_wlc_update_svc(struct ip_vs_service *svc)
+{
+	return 0;
+}
+
+
+static inline unsigned int
+ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
+{
+	/*
+	 * We think the overhead of processing active connections is 256
+	 * times higher than that of inactive connections in average. (This
+	 * 256 times might not be accurate, we will change it later) We
+	 * use the following formula to estimate the overhead now:
+	 *		  dest->activeconns*256 + dest->inactconns
+	 */
+	return (atomic_read(&dest->activeconns) << 8) +
+		atomic_read(&dest->inactconns);
+}
+
+
+/*
+ *	Weighted Least Connection scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest, *least;
+	unsigned int loh, doh;
+
+	IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n");
+
+	/*
+	 * We calculate the load of each dest server as follows:
+	 *		  (dest overhead) / dest->weight
+	 *
+	 * Remember -- no floats in kernel mode!!!
+	 * The comparison of h1*w2 > h2*w1 is equivalent to that of
+	 *		  h1/w1 > h2/w2
+	 * if every weight is larger than zero.
+	 *
+	 * The server with weight=0 is quiesced and will not receive any
+	 * new connections.
+	 */
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+		    atomic_read(&dest->weight) > 0) {
+			least = dest;
+			loh = ip_vs_wlc_dest_overhead(least);
+			goto nextstage;
+		}
+	}
+	return NULL;
+
+	/*
+	 *    Find the destination with the least load.
+	 */
+  nextstage:
+	list_for_each_entry_continue(dest, &svc->destinations, n_list) {
+		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
+			continue;
+		doh = ip_vs_wlc_dest_overhead(dest);
+		if (loh * atomic_read(&dest->weight) >
+		    doh * atomic_read(&least->weight)) {
+			least = dest;
+			loh = doh;
+		}
+	}
+
+	IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u "
+		  "activeconns %d refcnt %d weight %d overhead %d\n",
+		  NIPQUAD(least->addr), ntohs(least->port),
+		  atomic_read(&least->activeconns),
+		  atomic_read(&least->refcnt),
+		  atomic_read(&least->weight), loh);
+
+	return least;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wlc_scheduler =
+{
+	.name =			"wlc",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_wlc_init_svc,
+	.done_service =		ip_vs_wlc_done_svc,
+	.update_service =	ip_vs_wlc_update_svc,
+	.schedule =		ip_vs_wlc_schedule,
+};
+
+
+static int __init ip_vs_wlc_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+static void __exit ip_vs_wlc_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler);
+}
+
+module_init(ip_vs_wlc_init);
+module_exit(ip_vs_wlc_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c
new file mode 100644
index 000000000000..749fa044eca5
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_wrr.c
@@ -0,0 +1,235 @@
+/*
+ * IPVS:        Weighted Round-Robin Scheduling module
+ *
+ * Version:     $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *     Wensong Zhang            :     changed the ip_vs_wrr_schedule to return dest
+ *     Wensong Zhang            :     changed some comestics things for debugging
+ *     Wensong Zhang            :     changed for the d-linked destination list
+ *     Wensong Zhang            :     added the ip_vs_wrr_update_svc
+ *     Julian Anastasov         :     fixed the bug of returning destination
+ *                                    with weight 0 when all weights are zero
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <net/ip_vs.h>
+
+/*
+ * current destination pointer for weighted round-robin scheduling
+ */
+struct ip_vs_wrr_mark {
+	struct list_head *cl;	/* current list head */
+	int cw;			/* current weight */
+	int mw;			/* maximum weight */
+	int di;			/* decreasing interval */
+};
+
+
+/*
+ *    Get the gcd of server weights
+ */
+static int gcd(int a, int b)
+{
+	int c;
+
+	while ((c = a % b)) {
+		a = b;
+		b = c;
+	}
+	return b;
+}
+
+static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int weight;
+	int g = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		weight = atomic_read(&dest->weight);
+		if (weight > 0) {
+			if (g > 0)
+				g = gcd(weight, g);
+			else
+				g = weight;
+		}
+	}
+	return g ? g : 1;
+}
+
+
+/*
+ *    Get the maximum weight of the service destinations.
+ */
+static int ip_vs_wrr_max_weight(struct ip_vs_service *svc)
+{
+	struct ip_vs_dest *dest;
+	int weight = 0;
+
+	list_for_each_entry(dest, &svc->destinations, n_list) {
+		if (atomic_read(&dest->weight) > weight)
+			weight = atomic_read(&dest->weight);
+	}
+
+	return weight;
+}
+
+
+static int ip_vs_wrr_init_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark;
+
+	/*
+	 *    Allocate the mark variable for WRR scheduling
+	 */
+	mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC);
+	if (mark == NULL) {
+		IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n");
+		return -ENOMEM;
+	}
+	mark->cl = &svc->destinations;
+	mark->cw = 0;
+	mark->mw = ip_vs_wrr_max_weight(svc);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+	svc->sched_data = mark;
+
+	return 0;
+}
+
+
+static int ip_vs_wrr_done_svc(struct ip_vs_service *svc)
+{
+	/*
+	 *    Release the mark variable
+	 */
+	kfree(svc->sched_data);
+
+	return 0;
+}
+
+
+static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
+{
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+
+	mark->cl = &svc->destinations;
+	mark->mw = ip_vs_wrr_max_weight(svc);
+	mark->di = ip_vs_wrr_gcd_weight(svc);
+	if (mark->cw > mark->mw)
+		mark->cw = 0;
+	return 0;
+}
+
+
+/*
+ *    Weighted Round-Robin Scheduling
+ */
+static struct ip_vs_dest *
+ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
+{
+	struct ip_vs_dest *dest;
+	struct ip_vs_wrr_mark *mark = svc->sched_data;
+	struct list_head *p;
+
+	IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n");
+
+	/*
+	 * This loop will always terminate, because mark->cw in (0, max_weight]
+	 * and at least one server has its weight equal to max_weight.
+	 */
+	write_lock(&svc->sched_lock);
+	p = mark->cl;
+	while (1) {
+		if (mark->cl == &svc->destinations) {
+			/* it is at the head of the destination list */
+
+			if (mark->cl == mark->cl->next) {
+				/* no dest entry */
+				dest = NULL;
+				goto out;
+			}
+
+			mark->cl = svc->destinations.next;
+			mark->cw -= mark->di;
+			if (mark->cw <= 0) {
+				mark->cw = mark->mw;
+				/*
+				 * Still zero, which means no available servers.
+				 */
+				if (mark->cw == 0) {
+					mark->cl = &svc->destinations;
+					IP_VS_INFO("ip_vs_wrr_schedule(): "
+						   "no available servers\n");
+					dest = NULL;
+					goto out;
+				}
+			}
+		} else
+			mark->cl = mark->cl->next;
+
+		if (mark->cl != &svc->destinations) {
+			/* not at the head of the list */
+			dest = list_entry(mark->cl, struct ip_vs_dest, n_list);
+			if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
+			    atomic_read(&dest->weight) >= mark->cw) {
+				/* got it */
+				break;
+			}
+		}
+
+		if (mark->cl == p && mark->cw == mark->di) {
+			/* back to the start, and no dest is found.
+			   It is only possible when all dests are OVERLOADED */
+			dest = NULL;
+			goto out;
+		}
+	}
+
+	IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u "
+		  "activeconns %d refcnt %d weight %d\n",
+		  NIPQUAD(dest->addr), ntohs(dest->port),
+		  atomic_read(&dest->activeconns),
+		  atomic_read(&dest->refcnt),
+		  atomic_read(&dest->weight));
+
+  out:
+	write_unlock(&svc->sched_lock);
+	return dest;
+}
+
+
+static struct ip_vs_scheduler ip_vs_wrr_scheduler = {
+	.name =			"wrr",
+	.refcnt =		ATOMIC_INIT(0),
+	.module =		THIS_MODULE,
+	.init_service =		ip_vs_wrr_init_svc,
+	.done_service =		ip_vs_wrr_done_svc,
+	.update_service =	ip_vs_wrr_update_svc,
+	.schedule =		ip_vs_wrr_schedule,
+};
+
+static int __init ip_vs_wrr_init(void)
+{
+	INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list);
+	return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ;
+}
+
+static void __exit ip_vs_wrr_cleanup(void)
+{
+	unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler);
+}
+
+module_init(ip_vs_wrr_init);
+module_exit(ip_vs_wrr_cleanup);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c
new file mode 100644
index 000000000000..faa6176bbeb1
--- /dev/null
+++ b/net/ipv4/ipvs/ip_vs_xmit.c
@@ -0,0 +1,563 @@
+/*
+ * ip_vs_xmit.c: various packet transmitters for IPVS
+ *
+ * Version:     $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
+ *
+ * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
+ *              Julian Anastasov <ja@ssi.bg>
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes:
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>                  /* for tcphdr */
+#include <net/tcp.h>                    /* for csum_tcpudp_magic */
+#include <net/udp.h>
+#include <net/icmp.h>                   /* for icmp_send */
+#include <net/route.h>                  /* for ip_route_output */
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip_vs.h>
+
+
+/*
+ *      Destination cache to speed up outgoing route lookup
+ */
+static inline void
+__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
+{
+	struct dst_entry *old_dst;
+
+	old_dst = dest->dst_cache;
+	dest->dst_cache = dst;
+	dest->dst_rtos = rtos;
+	dst_release(old_dst);
+}
+
+static inline struct dst_entry *
+__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
+{
+	struct dst_entry *dst = dest->dst_cache;
+
+	if (!dst)
+		return NULL;
+	if ((dst->obsolete || rtos != dest->dst_rtos) &&
+	    dst->ops->check(dst, cookie) == NULL) {
+		dest->dst_cache = NULL;
+		dst_release(dst);
+		return NULL;
+	}
+	dst_hold(dst);
+	return dst;
+}
+
+static inline struct rtable *
+__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct ip_vs_dest *dest = cp->dest;
+
+	if (dest) {
+		spin_lock(&dest->dst_lock);
+		if (!(rt = (struct rtable *)
+		      __ip_vs_dst_check(dest, rtos, 0))) {
+			struct flowi fl = {
+				.oif = 0,
+				.nl_u = {
+					.ip4_u = {
+						.daddr = dest->addr,
+						.saddr = 0,
+						.tos = rtos, } },
+			};
+
+			if (ip_route_output_key(&rt, &fl)) {
+				spin_unlock(&dest->dst_lock);
+				IP_VS_DBG_RL("ip_route_output error, "
+					     "dest: %u.%u.%u.%u\n",
+					     NIPQUAD(dest->addr));
+				return NULL;
+			}
+			__ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
+			IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
+				  NIPQUAD(dest->addr),
+				  atomic_read(&rt->u.dst.__refcnt), rtos);
+		}
+		spin_unlock(&dest->dst_lock);
+	} else {
+		struct flowi fl = {
+			.oif = 0,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = cp->daddr,
+					.saddr = 0,
+					.tos = rtos, } },
+		};
+
+		if (ip_route_output_key(&rt, &fl)) {
+			IP_VS_DBG_RL("ip_route_output error, dest: "
+				     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
+			return NULL;
+		}
+	}
+
+	return rt;
+}
+
+
+/*
+ *	Release dest->dst_cache before a dest is removed
+ */
+void
+ip_vs_dst_reset(struct ip_vs_dest *dest)
+{
+	struct dst_entry *old_dst;
+
+	old_dst = dest->dst_cache;
+	dest->dst_cache = NULL;
+	dst_release(old_dst);
+}
+
+#define IP_VS_XMIT(skb, rt)				\
+do {							\
+	nf_reset_debug(skb);				\
+	(skb)->nfcache |= NFC_IPVS_PROPERTY;		\
+	(skb)->ip_summed = CHECKSUM_NONE;		\
+	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,	\
+		(rt)->u.dst.dev, dst_output);		\
+} while (0)
+
+
+/*
+ *      NULL transmitter (do nothing except return NF_ACCEPT)
+ */
+int
+ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp)
+{
+	/* we do not touch skb and do not need pskb ptr */
+	return NF_ACCEPT;
+}
+
+
+/*
+ *      Bypass transmitter
+ *      Let packets bypass the destination when the destination is not
+ *      available, it may be only used in transparent cache cluster.
+ */
+int
+ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct iphdr  *iph = skb->nh.iph;
+	u8     tos = iph->tos;
+	int    mtu;
+	struct flowi fl = {
+		.oif = 0,
+		.nl_u = {
+			.ip4_u = {
+				.daddr = iph->daddr,
+				.saddr = 0,
+				.tos = RT_TOS(tos), } },
+	};
+
+	EnterFunction(10);
+
+	if (ip_route_output_key(&rt, &fl)) {
+		IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
+			     "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
+		goto tx_error_icmp;
+	}
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+		ip_rt_put(rt);
+		return NF_STOLEN;
+	}
+	ip_send_check(skb->nh.iph);
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+ tx_error_icmp:
+	dst_link_failure(skb);
+ tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+
+/*
+ *      NAT transmitter (only for outside-to-inside nat forwarding)
+ *      Not used for related ICMP
+ */
+int
+ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+	       struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;		/* Route to the other host */
+	int mtu;
+	struct iphdr *iph = skb->nh.iph;
+
+	EnterFunction(10);
+
+	/* check if it is a connection of no-client-port */
+	if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
+		__u16 _pt, *p;
+		p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
+		if (p == NULL)
+			goto tx_error;
+		ip_vs_conn_fill_cport(cp, *p);
+		IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
+	}
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* mangle the packet */
+	if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
+		goto tx_error;
+	skb->nh.iph->daddr = cp->daddr;
+	ip_send_check(skb->nh.iph);
+
+	IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
+
+	/* FIXME: when application helper enlarges the packet and the length
+	   is larger than the MTU of outgoing device, there will be still
+	   MTU problem. */
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	LeaveFunction(10);
+	kfree_skb(skb);
+	return NF_STOLEN;
+  tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
+}
+
+
+/*
+ *   IP Tunneling transmitter
+ *
+ *   This function encapsulates the packet in a new IP packet, its
+ *   destination will be set to cp->daddr. Most code of this function
+ *   is taken from ipip.c.
+ *
+ *   It is used in VS/TUN cluster. The load balancer selects a real
+ *   server from a cluster based on a scheduling algorithm,
+ *   encapsulates the request packet and forwards it to the selected
+ *   server. For example, all real servers are configured with
+ *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
+ *   the encapsulated packet, it will decapsulate the packet, processe
+ *   the request and return the response packets directly to the client
+ *   without passing the load balancer. This can greatly increase the
+ *   scalability of virtual server.
+ *
+ *   Used for ANY protocol
+ */
+int
+ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		  struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct net_device *tdev;		/* Device to other host */
+	struct iphdr  *old_iph = skb->nh.iph;
+	u8     tos = old_iph->tos;
+	u16    df = old_iph->frag_off;
+	struct iphdr  *iph;			/* Our new IP header */
+	int    max_headroom;			/* The extra header space needed */
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (skb->protocol != __constant_htons(ETH_P_IP)) {
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
+			     "ETH_P_IP: %d, skb protocol: %d\n",
+			     __constant_htons(ETH_P_IP), skb->protocol);
+		goto tx_error;
+	}
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
+		goto tx_error_icmp;
+
+	tdev = rt->u.dst.dev;
+
+	mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
+	if (mtu < 68) {
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
+		goto tx_error;
+	}
+	if (skb->dst)
+		skb->dst->ops->update_pmtu(skb->dst, mtu);
+
+	df |= (old_iph->frag_off&__constant_htons(IP_DF));
+
+	if ((old_iph->frag_off&__constant_htons(IP_DF))
+	    && mtu < ntohs(old_iph->tot_len)) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Okay, now see if we can stuff it in the buffer as-is.
+	 */
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
+
+	if (skb_headroom(skb) < max_headroom
+	    || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb =
+			skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+			kfree_skb(skb);
+			IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
+			return NF_STOLEN;
+		}
+		kfree_skb(skb);
+		skb = new_skb;
+		old_iph = skb->nh.iph;
+	}
+
+	skb->h.raw = (void *) old_iph;
+
+	/* fix old IP header checksum */
+	ip_send_check(old_iph);
+
+	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/*
+	 *	Push down and install the IPIP header.
+	 */
+	iph			=	skb->nh.iph;
+	iph->version		=	4;
+	iph->ihl		=	sizeof(struct iphdr)>>2;
+	iph->frag_off		=	df;
+	iph->protocol		=	IPPROTO_IPIP;
+	iph->tos		=	tos;
+	iph->daddr		=	rt->rt_dst;
+	iph->saddr		=	rt->rt_src;
+	iph->ttl		=	old_iph->ttl;
+	iph->tot_len		=	htons(skb->len);
+	ip_select_ident(iph, &rt->u.dst, NULL);
+	ip_send_check(iph);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(skb, rt);
+
+	LeaveFunction(10);
+
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+
+/*
+ *      Direct Routing transmitter
+ *      Used for ANY protocol
+ */
+int
+ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+	      struct ip_vs_protocol *pp)
+{
+	struct rtable *rt;			/* Route to the other host */
+	struct iphdr  *iph = skb->nh.iph;
+	int    mtu;
+
+	EnterFunction(10);
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
+		icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
+		ip_rt_put(rt);
+		IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
+		goto tx_error;
+	}
+
+	/*
+	 * Call ip_send_check because we are not sure it is called
+	 * after ip_defrag. Is copy-on-write needed?
+	 */
+	if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
+		ip_rt_put(rt);
+		return NF_STOLEN;
+	}
+	ip_send_check(skb->nh.iph);
+
+	/* drop old route */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(skb, rt);
+
+	LeaveFunction(10);
+	return NF_STOLEN;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	kfree_skb(skb);
+	LeaveFunction(10);
+	return NF_STOLEN;
+}
+
+
+/*
+ *	ICMP packet transmitter
+ *	called by the ip_vs_in_icmp
+ */
+int
+ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
+		struct ip_vs_protocol *pp, int offset)
+{
+	struct rtable	*rt;	/* Route to the other host */
+	int mtu;
+	int rc;
+
+	EnterFunction(10);
+
+	/* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
+	   forwarded directly here, because there is no need to
+	   translate address/port back */
+	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
+		if (cp->packet_xmit)
+			rc = cp->packet_xmit(skb, cp, pp);
+		else
+			rc = NF_ACCEPT;
+		/* do not touch skb anymore */
+		atomic_inc(&cp->in_pkts);
+		__ip_vs_conn_put(cp);
+		goto out;
+	}
+
+	/*
+	 * mangle and send the packet here (only for VS/NAT)
+	 */
+
+	if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
+		goto tx_error_icmp;
+
+	/* MTU checking */
+	mtu = dst_mtu(&rt->u.dst);
+	if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
+		ip_rt_put(rt);
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
+		goto tx_error;
+	}
+
+	/* copy-on-write the packet before mangling it */
+	if (!ip_vs_make_skb_writable(&skb, offset))
+		goto tx_error_put;
+
+	if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
+		goto tx_error_put;
+
+	/* drop the old route when skb is not shared */
+	dst_release(skb->dst);
+	skb->dst = &rt->u.dst;
+
+	ip_vs_nat_icmp(skb, pp, cp, 0);
+
+	/* Another hack: avoid icmp_send in ip_fragment */
+	skb->local_df = 1;
+
+	IP_VS_XMIT(skb, rt);
+
+	rc = NF_STOLEN;
+	goto out;
+
+  tx_error_icmp:
+	dst_link_failure(skb);
+  tx_error:
+	dev_kfree_skb(skb);
+	rc = NF_STOLEN;
+  out:
+	LeaveFunction(10);
+	return rc;
+  tx_error_put:
+	ip_rt_put(rt);
+	goto tx_error;
+}
diff --git a/net/ipv4/multipath.c b/net/ipv4/multipath.c
new file mode 100644
index 000000000000..4e9ca7c76407
--- /dev/null
+++ b/net/ipv4/multipath.c
@@ -0,0 +1,55 @@
+/* multipath.c: IPV4 multipath algorithm support.
+ *
+ * Copyright (C) 2004, 2005 Einar Lueck <elueck@de.ibm.com>
+ * Copyright (C) 2005 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+
+#include <net/ip_mp_alg.h>
+
+static DEFINE_SPINLOCK(alg_table_lock);
+struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1];
+
+int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
+{
+	struct ip_mp_alg_ops **slot;
+	int err;
+
+	if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX ||
+	    !ops->mp_alg_select_route)
+		return -EINVAL;
+
+	spin_lock(&alg_table_lock);
+	slot = &ip_mp_alg_table[n];
+	if (*slot != NULL) {
+		err = -EBUSY;
+	} else {
+		*slot = ops;
+		err = 0;
+	}
+	spin_unlock(&alg_table_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(multipath_alg_register);
+
+void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n)
+{
+	struct ip_mp_alg_ops **slot;
+
+	if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX)
+		return;
+
+	spin_lock(&alg_table_lock);
+	slot = &ip_mp_alg_table[n];
+	if (*slot == ops)
+		*slot = NULL;
+	spin_unlock(&alg_table_lock);
+
+	synchronize_net();
+}
+EXPORT_SYMBOL(multipath_alg_unregister);
diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c
new file mode 100644
index 000000000000..9349686131fc
--- /dev/null
+++ b/net/ipv4/multipath_drr.c
@@ -0,0 +1,265 @@
+/*
+ *              Device round robin policy for multipath.
+ *
+ *
+ * Version:	$Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $
+ *
+ * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/ip_mp_alg.h>
+
+struct multipath_device {
+	int		ifi; /* interface index of device */
+	atomic_t	usecount;
+	int 		allocated;
+};
+
+#define MULTIPATH_MAX_DEVICECANDIDATES 10
+
+static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES];
+static DEFINE_SPINLOCK(state_lock);
+static struct rtable *last_selection = NULL;
+
+static int inline __multipath_findslot(void)
+{
+	int i;
+
+	for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
+		if (state[i].allocated == 0)
+			return i;
+	}
+	return -1;
+}
+
+static int inline __multipath_finddev(int ifindex)
+{
+	int i;
+
+	for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) {
+		if (state[i].allocated != 0 &&
+		    state[i].ifi == ifindex)
+			return i;
+	}
+	return -1;
+}
+
+static int drr_dev_event(struct notifier_block *this,
+			 unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	int devidx;
+
+	switch (event) {
+	case NETDEV_UNREGISTER:
+	case NETDEV_DOWN:
+		spin_lock_bh(&state_lock);
+
+		devidx = __multipath_finddev(dev->ifindex);
+		if (devidx != -1) {
+			state[devidx].allocated = 0;
+			state[devidx].ifi = 0;
+			atomic_set(&state[devidx].usecount, 0);
+		}
+
+		spin_unlock_bh(&state_lock);
+		break;
+	};
+
+	return NOTIFY_DONE;
+}
+
+struct notifier_block drr_dev_notifier = {
+	.notifier_call	= drr_dev_event,
+};
+
+static void drr_remove(struct rtable *rt)
+{
+	if (last_selection == rt)
+		last_selection = NULL;
+}
+
+static void drr_safe_inc(atomic_t *usecount)
+{
+	int n;
+
+	atomic_inc(usecount);
+
+	n = atomic_read(usecount);
+	if (n <= 0) {
+		int i;
+
+		spin_lock_bh(&state_lock);
+
+		for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++)
+			atomic_set(&state[i].usecount, 0);
+
+		spin_unlock_bh(&state_lock);
+	}
+}
+
+static void drr_select_route(const struct flowi *flp,
+			     struct rtable *first, struct rtable **rp)
+{
+	struct rtable *nh, *result, *cur_min;
+	int min_usecount = -1; 
+	int devidx = -1;
+	int cur_min_devidx = -1;
+
+       	/* if necessary and possible utilize the old alternative */
+	if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
+	    last_selection != NULL) {
+		result = last_selection;
+		*rp = result;
+		return;
+	}
+
+	/* 1. make sure all alt. nexthops have the same GC related data */
+	/* 2. determine the new candidate to be returned */
+	result = NULL;
+	cur_min = NULL;
+	for (nh = rcu_dereference(first); nh;
+	     nh = rcu_dereference(nh->u.rt_next)) {
+		if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
+		    multipath_comparekeys(&nh->fl, flp)) {
+			int nh_ifidx = nh->u.dst.dev->ifindex;
+
+			nh->u.dst.lastuse = jiffies;
+			nh->u.dst.__use++;
+			if (result != NULL)
+				continue;
+
+			/* search for the output interface */
+
+			/* this is not SMP safe, only add/remove are
+			 * SMP safe as wrong usecount updates have no big
+			 * impact
+			 */
+			devidx = __multipath_finddev(nh_ifidx);
+			if (devidx == -1) {
+				/* add the interface to the array 
+				 * SMP safe
+				 */
+				spin_lock_bh(&state_lock);
+
+				/* due to SMP: search again */
+				devidx = __multipath_finddev(nh_ifidx);
+				if (devidx == -1) {
+					/* add entry for device */
+					devidx = __multipath_findslot();
+					if (devidx == -1) {
+						/* unlikely but possible */
+						continue;
+					}
+
+					state[devidx].allocated = 1;
+					state[devidx].ifi = nh_ifidx;
+					atomic_set(&state[devidx].usecount, 0);
+					min_usecount = 0;
+				}
+
+				spin_unlock_bh(&state_lock);
+			}
+
+			if (min_usecount == 0) {
+				/* if the device has not been used it is
+				 * the primary target
+				 */
+				drr_safe_inc(&state[devidx].usecount);
+				result = nh;
+			} else {
+				int count =
+					atomic_read(&state[devidx].usecount);
+
+				if (min_usecount == -1 ||
+				    count < min_usecount) {
+					cur_min = nh;
+					cur_min_devidx = devidx;
+					min_usecount = count;
+				}
+			}
+		}
+	}
+
+	if (!result) {
+		if (cur_min) {
+			drr_safe_inc(&state[cur_min_devidx].usecount);
+			result = cur_min;
+		} else {
+			result = first;
+		}
+	}
+
+	*rp = result;
+	last_selection = result;
+}
+
+static struct ip_mp_alg_ops drr_ops = {
+	.mp_alg_select_route	=	drr_select_route,
+	.mp_alg_remove		=	drr_remove,
+};
+
+static int __init drr_init(void)
+{
+	int err = register_netdevice_notifier(&drr_dev_notifier);
+
+	if (err)
+		return err;
+
+	err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	unregister_netdevice_notifier(&drr_dev_notifier);
+	return err;
+}
+
+static void __exit drr_exit(void)
+{
+	unregister_netdevice_notifier(&drr_dev_notifier);
+	multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR);
+}
+
+module_init(drr_init);
+module_exit(drr_exit);
diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c
new file mode 100644
index 000000000000..805a16e47de5
--- /dev/null
+++ b/net/ipv4/multipath_random.c
@@ -0,0 +1,128 @@
+/*
+ *              Random policy for multipath.
+ *
+ *
+ * Version:	$Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $
+ *
+ * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/ip_mp_alg.h>
+
+#define MULTIPATH_MAX_CANDIDATES 40
+
+/* interface to random number generation */
+static unsigned int RANDOM_SEED = 93186752;
+
+static inline unsigned int random(unsigned int ubound)
+{
+	static unsigned int a = 1588635695,
+		q = 2,
+		r = 1117695901;
+
+	RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
+
+	return RANDOM_SEED % ubound;
+}
+
+
+static void random_select_route(const struct flowi *flp,
+				struct rtable *first,
+				struct rtable **rp)
+{
+	struct rtable *rt;
+	struct rtable *decision;
+	unsigned char candidate_count = 0;
+
+	/* count all candidate */
+	for (rt = rcu_dereference(first); rt;
+	     rt = rcu_dereference(rt->u.rt_next)) {
+		if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
+		    multipath_comparekeys(&rt->fl, flp))
+			++candidate_count;
+	}
+
+	/* choose a random candidate */
+	decision = first;
+	if (candidate_count > 1) {
+		unsigned char i = 0;
+		unsigned char candidate_no = (unsigned char)
+			random(candidate_count);
+
+		/* find chosen candidate and adjust GC data for all candidates
+		 * to ensure they stay in cache
+		 */
+		for (rt = first; rt; rt = rt->u.rt_next) {
+			if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
+			    multipath_comparekeys(&rt->fl, flp)) {
+				rt->u.dst.lastuse = jiffies;
+
+				if (i == candidate_no)
+					decision = rt;
+
+				if (i >= candidate_count)
+					break;
+
+				i++;
+			}
+		}
+	}
+
+	decision->u.dst.__use++;
+	*rp = decision;
+}
+
+static struct ip_mp_alg_ops random_ops = {
+	.mp_alg_select_route	=	random_select_route,
+};
+
+static int __init random_init(void)
+{
+	return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM);
+}
+
+static void __exit random_exit(void)
+{
+	multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM);
+}
+
+module_init(random_init);
+module_exit(random_exit);
diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c
new file mode 100644
index 000000000000..554a82568160
--- /dev/null
+++ b/net/ipv4/multipath_rr.c
@@ -0,0 +1,115 @@
+/*
+ *              Round robin policy for multipath.
+ *
+ *
+ * Version:	$Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $
+ *
+ * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/ip_mp_alg.h>
+
+#define MULTIPATH_MAX_CANDIDATES 40
+
+static struct rtable* last_used = NULL;
+
+static void rr_remove(struct rtable *rt)
+{
+	if (last_used == rt)
+		last_used = NULL;
+}
+
+static void rr_select_route(const struct flowi *flp,
+			    struct rtable *first, struct rtable **rp)
+{
+	struct rtable *nh, *result, *min_use_cand = NULL;
+	int min_use = -1;
+
+	/* if necessary and possible utilize the old alternative */
+	if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 &&
+	    last_used != NULL) {
+		result = last_used;
+		goto out;
+	}
+
+	/* 1. make sure all alt. nexthops have the same GC related data
+	 * 2. determine the new candidate to be returned
+	 */
+	result = NULL;
+	for (nh = rcu_dereference(first); nh;
+ 	     nh = rcu_dereference(nh->u.rt_next)) {
+		if ((nh->u.dst.flags & DST_BALANCED) != 0 &&
+		    multipath_comparekeys(&nh->fl, flp)) {
+			nh->u.dst.lastuse = jiffies;
+
+			if (min_use == -1 || nh->u.dst.__use < min_use) {
+				min_use = nh->u.dst.__use;
+				min_use_cand = nh;
+			}
+		}
+	}
+	result = min_use_cand;
+	if (!result)
+		result = first;
+
+out:
+	last_used = result;
+	result->u.dst.__use++;
+	*rp = result;
+}
+
+static struct ip_mp_alg_ops rr_ops = {
+	.mp_alg_select_route	=	rr_select_route,
+	.mp_alg_remove		=	rr_remove,
+};
+
+static int __init rr_init(void)
+{
+	return multipath_alg_register(&rr_ops, IP_MP_ALG_RR);
+}
+
+static void __exit rr_exit(void)
+{
+	multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR);
+}
+
+module_init(rr_init);
+module_exit(rr_exit);
diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c
new file mode 100644
index 000000000000..10b23e1bece6
--- /dev/null
+++ b/net/ipv4/multipath_wrandom.c
@@ -0,0 +1,344 @@
+/*
+ *              Weighted random policy for multipath.
+ *
+ *
+ * Version:	$Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $
+ *
+ * Authors:	Einar Lueck <elueck@de.ibm.com><lkml@einar-lueck.de>
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/fcntl.h>
+#include <linux/stat.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/mroute.h>
+#include <linux/init.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <linux/notifier.h>
+#include <linux/if_arp.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ipip.h>
+#include <net/checksum.h>
+#include <net/ip_fib.h>
+#include <net/ip_mp_alg.h>
+
+#define MULTIPATH_STATE_SIZE 15
+
+struct multipath_candidate {
+	struct multipath_candidate	*next;
+	int				power;
+	struct rtable			*rt;
+};
+
+struct multipath_dest {
+	struct list_head	list;
+
+	const struct fib_nh	*nh_info;
+	__u32			netmask;
+	__u32			network;
+	unsigned char		prefixlen;
+
+	struct rcu_head		rcu;
+};
+
+struct multipath_bucket {
+	struct list_head	head;
+	spinlock_t		lock;
+};
+
+struct multipath_route {
+	struct list_head	list;
+
+	int			oif;
+	__u32			gw;
+	struct list_head	dests;
+
+	struct rcu_head		rcu;
+};
+
+/* state: primarily weight per route information */
+static struct multipath_bucket state[MULTIPATH_STATE_SIZE];
+
+/* interface to random number generation */
+static unsigned int RANDOM_SEED = 93186752;
+
+static inline unsigned int random(unsigned int ubound)
+{
+	static unsigned int a = 1588635695,
+		q = 2,
+		r = 1117695901;
+	RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q);
+	return RANDOM_SEED % ubound;
+}
+
+static unsigned char __multipath_lookup_weight(const struct flowi *fl,
+					       const struct rtable *rt)
+{
+	const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE;
+	struct multipath_route *r;
+	struct multipath_route *target_route = NULL;
+	struct multipath_dest *d;
+	int weight = 1;
+
+	/* lookup the weight information for a certain route */
+	rcu_read_lock();
+
+	/* find state entry for gateway or add one if necessary */
+	list_for_each_entry_rcu(r, &state[state_idx].head, list) {
+		if (r->gw == rt->rt_gateway &&
+		    r->oif == rt->idev->dev->ifindex) {
+			target_route = r;
+			break;
+		}
+	}
+
+	if (!target_route) {
+		/* this should not happen... but we are prepared */
+		printk( KERN_CRIT"%s: missing state for gateway: %u and " \
+			"device %d\n", __FUNCTION__, rt->rt_gateway,
+			rt->idev->dev->ifindex);
+		goto out;
+	}
+
+	/* find state entry for destination */
+	list_for_each_entry_rcu(d, &target_route->dests, list) {
+		__u32 targetnetwork = fl->fl4_dst & 
+			(0xFFFFFFFF >> (32 - d->prefixlen));
+
+		if ((targetnetwork & d->netmask) == d->network) {
+			weight = d->nh_info->nh_weight;
+			goto out;
+		}
+	}
+
+out:
+	rcu_read_unlock();
+	return weight;
+}
+
+static void wrandom_init_state(void) 
+{
+	int i;
+
+	for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
+		INIT_LIST_HEAD(&state[i].head);
+		spin_lock_init(&state[i].lock);
+	}
+}
+
+static void wrandom_select_route(const struct flowi *flp,
+				 struct rtable *first,
+				 struct rtable **rp)
+{
+	struct rtable *rt;
+	struct rtable *decision;
+	struct multipath_candidate *first_mpc = NULL;
+	struct multipath_candidate *mpc, *last_mpc = NULL;
+	int power = 0;
+	int last_power;
+	int selector;
+	const size_t size_mpc = sizeof(struct multipath_candidate);
+
+	/* collect all candidates and identify their weights */
+	for (rt = rcu_dereference(first); rt;
+	     rt = rcu_dereference(rt->u.rt_next)) {
+		if ((rt->u.dst.flags & DST_BALANCED) != 0 &&
+		    multipath_comparekeys(&rt->fl, flp)) {
+			struct multipath_candidate* mpc =
+				(struct multipath_candidate*)
+				kmalloc(size_mpc, GFP_KERNEL);
+
+			if (!mpc)
+				return;
+
+			power += __multipath_lookup_weight(flp, rt) * 10000;
+
+			mpc->power = power;
+			mpc->rt = rt;
+			mpc->next = NULL;
+
+			if (!first_mpc)
+				first_mpc = mpc;
+			else
+				last_mpc->next = mpc;
+
+			last_mpc = mpc;
+		}
+	}
+
+	/* choose a weighted random candidate */
+	decision = first;
+	selector = random(power);
+	last_power = 0;
+
+	/* select candidate, adjust GC data and cleanup local state */
+	decision = first;
+	last_mpc = NULL;
+	for (mpc = first_mpc; mpc; mpc = mpc->next) {
+		mpc->rt->u.dst.lastuse = jiffies;
+		if (last_power <= selector && selector < mpc->power)
+			decision = mpc->rt;
+
+		last_power = mpc->power;
+		if (last_mpc)
+			kfree(last_mpc);
+
+		last_mpc = mpc;
+	}
+
+	if (last_mpc) {
+		/* concurrent __multipath_flush may lead to !last_mpc */
+		kfree(last_mpc);
+	}
+
+	decision->u.dst.__use++;
+	*rp = decision;
+}
+
+static void wrandom_set_nhinfo(__u32 network,
+			       __u32 netmask,
+			       unsigned char prefixlen,
+			       const struct fib_nh *nh)
+{
+	const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE;
+	struct multipath_route *r, *target_route = NULL;
+	struct multipath_dest *d, *target_dest = NULL;
+
+	/* store the weight information for a certain route */
+	spin_lock(&state[state_idx].lock);
+
+	/* find state entry for gateway or add one if necessary */
+	list_for_each_entry_rcu(r, &state[state_idx].head, list) {
+		if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) {
+			target_route = r;
+			break;
+		}
+	}
+
+	if (!target_route) {
+		const size_t size_rt = sizeof(struct multipath_route);
+		target_route = (struct multipath_route *)
+			kmalloc(size_rt, GFP_KERNEL);
+
+		target_route->gw = nh->nh_gw;
+		target_route->oif = nh->nh_oif;
+		memset(&target_route->rcu, 0, sizeof(struct rcu_head));
+		INIT_LIST_HEAD(&target_route->dests);
+
+		list_add_rcu(&target_route->list, &state[state_idx].head);
+	}
+
+	/* find state entry for destination or add one if necessary */
+	list_for_each_entry_rcu(d, &target_route->dests, list) {
+		if (d->nh_info == nh) {
+			target_dest = d;
+			break;
+		}
+	}
+
+	if (!target_dest) {
+		const size_t size_dst = sizeof(struct multipath_dest);
+		target_dest = (struct multipath_dest*)
+			kmalloc(size_dst, GFP_KERNEL);
+
+		target_dest->nh_info = nh;
+		target_dest->network = network;
+		target_dest->netmask = netmask;
+		target_dest->prefixlen = prefixlen;
+		memset(&target_dest->rcu, 0, sizeof(struct rcu_head));
+
+		list_add_rcu(&target_dest->list, &target_route->dests);
+	}
+	/* else: we already stored this info for another destination =>
+	 * we are finished
+	 */
+
+	spin_unlock(&state[state_idx].lock);
+}
+
+static void __multipath_free(struct rcu_head *head)
+{
+	struct multipath_route *rt = container_of(head, struct multipath_route,
+						  rcu);
+	kfree(rt);
+}
+
+static void __multipath_free_dst(struct rcu_head *head)
+{
+  	struct multipath_dest *dst = container_of(head,
+						  struct multipath_dest,
+						  rcu);
+	kfree(dst);
+}
+
+static void wrandom_flush(void)
+{
+	int i;
+
+	/* defere delete to all entries */
+	for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) {
+		struct multipath_route *r;
+
+		spin_lock(&state[i].lock);
+		list_for_each_entry_rcu(r, &state[i].head, list) {
+			struct multipath_dest *d;
+			list_for_each_entry_rcu(d, &r->dests, list) {
+				list_del_rcu(&d->list);
+				call_rcu(&d->rcu,
+					 __multipath_free_dst);
+			}
+			list_del_rcu(&r->list);
+			call_rcu(&r->rcu,
+				 __multipath_free);
+		}
+
+		spin_unlock(&state[i].lock);
+	}
+}
+
+static struct ip_mp_alg_ops wrandom_ops = {
+	.mp_alg_select_route	=	wrandom_select_route,
+	.mp_alg_flush		=	wrandom_flush,
+	.mp_alg_set_nhinfo	=	wrandom_set_nhinfo,
+};
+
+static int __init wrandom_init(void)
+{
+	wrandom_init_state();
+
+	return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM);
+}
+
+static void __exit wrandom_exit(void)
+{
+	multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM);
+}
+
+module_init(wrandom_init);
+module_exit(wrandom_exit);
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
new file mode 100644
index 000000000000..46d4cb1c06f0
--- /dev/null
+++ b/net/ipv4/netfilter/Kconfig
@@ -0,0 +1,696 @@
+#
+# IP netfilter configuration
+#
+
+menu "IP: Netfilter Configuration"
+	depends on INET && NETFILTER
+
+# connection tracking, helpers and protocols
+config IP_NF_CONNTRACK
+	tristate "Connection tracking (required for masq/NAT)"
+	---help---
+	  Connection tracking keeps a record of what packets have passed
+	  through your machine, in order to figure out how they are related
+	  into connections.
+
+	  This is required to do Masquerading or other kinds of Network
+	  Address Translation (except for Fast NAT).  It can also be used to
+	  enhance packet filtering (see `Connection state match support'
+	  below).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_CT_ACCT
+	bool "Connection tracking flow accounting"
+	depends on IP_NF_CONNTRACK
+	help
+	  If this option is enabled, the connection tracking code will
+	  keep per-flow packet and byte counters.
+
+	  Those counters can be used for flow-based accounting or the
+	  `connbytes' match.
+
+	  If unsure, say `N'.
+
+config IP_NF_CONNTRACK_MARK
+	bool  'Connection mark tracking support'
+	help
+	  This option enables support for connection marks, used by the
+	  `CONNMARK' target and `connmark' match. Similar to the mark value
+	  of packets, but this mark value is kept in the conntrack session
+	  instead of the individual packets.
+	
+config IP_NF_CT_PROTO_SCTP
+	tristate  'SCTP protocol connection tracking support (EXPERIMENTAL)'
+	depends on IP_NF_CONNTRACK && EXPERIMENTAL
+	help
+	  With this option enabled, the connection tracking code will
+	  be able to do state tracking on SCTP connections.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config IP_NF_FTP
+	tristate "FTP protocol support"
+	depends on IP_NF_CONNTRACK
+	help
+	  Tracking FTP connections is problematic: special helpers are
+	  required for tracking them, and doing masquerading and other forms
+	  of Network Address Translation on them.
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
+config IP_NF_IRC
+	tristate "IRC protocol support"
+	depends on IP_NF_CONNTRACK
+	---help---
+	  There is a commonly-used extension to IRC called
+	  Direct Client-to-Client Protocol (DCC).  This enables users to send
+	  files to each other, and also chat to each other without the need
+	  of a server.  DCC Sending is used anywhere you send files over IRC,
+	  and DCC Chat is most commonly used by Eggdrop bots.  If you are
+	  using NAT, this extension will enable you to send files and initiate
+	  chats.  Note that you do NOT need this extension to get files or
+	  have others initiate chats, or everything else in IRC.
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
+config IP_NF_TFTP
+	tristate "TFTP protocol support"
+	depends on IP_NF_CONNTRACK
+	help
+	  TFTP connection tracking helper, this is required depending
+	  on how restrictive your ruleset is.
+	  If you are using a tftp client behind -j SNAT or -j MASQUERADING
+	  you will need this.
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
+config IP_NF_AMANDA
+	tristate "Amanda backup protocol support"
+	depends on IP_NF_CONNTRACK
+	help
+	  If you are running the Amanda backup package <http://www.amanda.org/>
+	  on this machine or machines that will be MASQUERADED through this
+	  machine, then you may want to enable this feature.  This allows the
+	  connection tracking and natting code to allow the sub-channels that
+	  Amanda requires for communication of the backup data, messages and
+	  index.
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
+config IP_NF_QUEUE
+	tristate "Userspace queueing via NETLINK"
+	help
+	  Netfilter has the ability to queue packets to user space: the
+	  netlink device can be used to access them using this driver.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_IPTABLES
+	tristate "IP tables support (required for filtering/masq/NAT)"
+	help
+	  iptables is a general, extensible packet identification framework.
+	  The packet filtering and full NAT (masquerading, port forwarding,
+	  etc) subsystems now use this: say `Y' or `M' here if you want to use
+	  either of those.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+# The matches.
+config IP_NF_MATCH_LIMIT
+	tristate "limit match support"
+	depends on IP_NF_IPTABLES
+	help
+	  limit matching allows you to control the rate at which a rule can be
+	  matched: mainly useful in combination with the LOG target ("LOG
+	  target support", below) and to avoid some Denial of Service attacks.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_IPRANGE
+	tristate "IP range match support"
+	depends on IP_NF_IPTABLES
+	help
+	  This option makes possible to match IP addresses against IP address
+	  ranges.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_MAC
+	tristate "MAC address match support"
+	depends on IP_NF_IPTABLES
+	help
+	  MAC matching allows you to match packets based on the source
+	  Ethernet address of the packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_PKTTYPE
+	tristate "Packet type match support"
+	depends on IP_NF_IPTABLES
+	help
+         Packet type matching allows you to match a packet by
+         its "class", eg. BROADCAST, MULTICAST, ...
+
+	  Typical usage:
+	  iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_MARK
+	tristate "netfilter MARK match support"
+	depends on IP_NF_IPTABLES
+	help
+	  Netfilter mark matching allows you to match packets based on the
+	  `nfmark' value in the packet.  This can be set by the MARK target
+	  (see below).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_MULTIPORT
+	tristate "Multiple port match support"
+	depends on IP_NF_IPTABLES
+	help
+	  Multiport matching allows you to match TCP or UDP packets based on
+	  a series of source or destination ports: normally a rule can only
+	  match a single range of ports.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_TOS
+	tristate "TOS match support"
+	depends on IP_NF_IPTABLES
+	help
+	  TOS matching allows you to match packets based on the Type Of
+	  Service fields of the IP packet.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_RECENT
+	tristate "recent match support"
+	depends on IP_NF_IPTABLES
+	help
+	  This match is used for creating one or many lists of recently
+	  used addresses and then matching against that/those list(s).
+
+	  Short options are available by using 'iptables -m recent -h'
+	  Official Website: <http://snowman.net/projects/ipt_recent/>
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_ECN
+	tristate "ECN match support"
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a `ECN' match, which allows you to match against
+	  the IPv4 and TCP header ECN fields.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_DSCP
+	tristate "DSCP match support"
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a `DSCP' match, which allows you to match against
+	  the IPv4 header DSCP field (DSCP codepoint).
+
+	  The DSCP codepoint can have any value between 0x0 and 0x4f.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_AH_ESP
+	tristate "AH/ESP match support"
+	depends on IP_NF_IPTABLES
+	help
+	  These two match extensions (`ah' and `esp') allow you to match a
+	  range of SPIs inside AH or ESP headers of IPSec packets.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_LENGTH
+	tristate "LENGTH match support"
+	depends on IP_NF_IPTABLES
+	help
+	  This option allows you to match the length of a packet against a
+	  specific value or range of values.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_TTL
+	tristate "TTL match support"
+	depends on IP_NF_IPTABLES
+	help
+	  This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user
+	  to match packets by their TTL value.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_TCPMSS
+	tristate "tcpmss match support"
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a `tcpmss' match, which allows you to examine the
+	  MSS value of TCP SYN packets, which control the maximum packet size
+	  for that connection.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_HELPER
+	tristate "Helper match support"
+	depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+	help
+	  Helper matching allows you to match packets in dynamic connections
+	  tracked by a conntrack-helper, ie. ip_conntrack_ftp
+
+	  To compile it as a module, choose M here.  If unsure, say Y.
+
+config IP_NF_MATCH_STATE
+	tristate "Connection state match support"
+	depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+	help
+	  Connection state matching allows you to match packets based on their
+	  relationship to a tracked connection (ie. previous packets).  This
+	  is a powerful tool for packet classification.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_CONNTRACK
+	tristate "Connection tracking match support"
+	depends on IP_NF_CONNTRACK && IP_NF_IPTABLES
+	help
+	  This is a general conntrack match module, a superset of the state match.
+
+	  It allows matching on additional conntrack information, which is
+	  useful in complex configurations, such as NAT gateways with multiple
+	  internet links or tunnels.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_OWNER
+	tristate "Owner match support"
+	depends on IP_NF_IPTABLES
+	help
+	  Packet owner matching allows you to match locally-generated packets
+	  based on who created them: the user, group, process or session.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_PHYSDEV
+	tristate "Physdev match support"
+	depends on IP_NF_IPTABLES && BRIDGE_NETFILTER
+	help
+	  Physdev packet matching matches against the physical bridge ports
+	  the IP packet arrived on or will leave by.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_MATCH_ADDRTYPE
+	tristate  'address type match support'
+	depends on IP_NF_IPTABLES
+	help
+	  This option allows you to match what routing thinks of an address,
+	  eg. UNICAST, LOCAL, BROADCAST, ...
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config IP_NF_MATCH_REALM
+	tristate  'realm match support'
+	depends on IP_NF_IPTABLES
+	select NET_CLS_ROUTE
+	help
+	  This option adds a `realm' match, which allows you to use the realm
+	  key from the routing subsystem inside iptables.
+	
+	  This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option 
+	  in tc world.
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config IP_NF_MATCH_SCTP
+	tristate  'SCTP protocol match support'
+	depends on IP_NF_IPTABLES
+	help
+	  With this option enabled, you will be able to use the iptables
+	  `sctp' match in order to match on SCTP source/destination ports
+	  and SCTP chunk types.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config IP_NF_MATCH_COMMENT
+	tristate  'comment match support'
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a `comment' dummy-match, which allows you to put
+	  comments in your iptables ruleset.
+
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config IP_NF_MATCH_CONNMARK
+	tristate  'Connection mark match support'
+	depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES
+	help
+	  This option adds a `connmark' match, which allows you to match the
+	  connection mark value previously set for the session by `CONNMARK'. 
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  The module will be called
+	  ipt_connmark.o.  If unsure, say `N'.
+
+config IP_NF_MATCH_HASHLIMIT
+	tristate  'hashlimit match support'
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a new iptables `hashlimit' match.  
+
+	  As opposed to `limit', this match dynamically crates a hash table
+	  of limit buckets, based on your selection of source/destination
+	  ip addresses and/or ports.
+
+	  It enables you to express policies like `10kpps for any given
+	  destination IP' or `500pps from any given source IP'  with a single
+	  IPtables rule.
+
+# `filter', generic and specific targets
+config IP_NF_FILTER
+	tristate "Packet filtering"
+	depends on IP_NF_IPTABLES
+	help
+	  Packet filtering defines a table `filter', which has a series of
+	  rules for simple packet filtering at local input, forwarding and
+	  local output.  See the man page for iptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_REJECT
+	tristate "REJECT target support"
+	depends on IP_NF_FILTER
+	help
+	  The REJECT target allows a filtering rule to specify that an ICMP
+	  error should be issued in response to an incoming packet, rather
+	  than silently being dropped.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_LOG
+	tristate "LOG target support"
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a `LOG' target, which allows you to create rules in
+	  any iptables table which records the packet header to the syslog.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_ULOG
+	tristate "ULOG target support"
+	depends on IP_NF_IPTABLES
+	---help---
+	  This option adds a `ULOG' target, which allows you to create rules in
+	  any iptables table. The packet is passed to a userspace logging
+	  daemon using netlink multicast sockets; unlike the LOG target
+	  which can only be viewed through syslog.
+
+	  The apropriate userspace logging daemon (ulogd) may be obtained from
+	  <http://www.gnumonks.org/projects/ulogd/>
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_TCPMSS
+	tristate "TCPMSS target support"
+	depends on IP_NF_IPTABLES
+	---help---
+	  This option adds a `TCPMSS' target, which allows you to alter the
+	  MSS value of TCP SYN packets, to control the maximum size for that
+	  connection (usually limiting it to your outgoing interface's MTU
+	  minus 40).
+
+	  This is used to overcome criminally braindead ISPs or servers which
+	  block ICMP Fragmentation Needed packets.  The symptoms of this
+	  problem are that everything works fine from your Linux
+	  firewall/router, but machines behind it can never exchange large
+	  packets:
+	  	1) Web browsers connect, then hang with no data received.
+	  	2) Small mail works fine, but large emails hang.
+	  	3) ssh works fine, but scp hangs after initial handshaking.
+
+	  Workaround: activate this option and add a rule to your firewall
+	  configuration like:
+
+	  iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \
+	  		 -j TCPMSS --clamp-mss-to-pmtu
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+# NAT + specific targets
+config IP_NF_NAT
+	tristate "Full NAT"
+	depends on IP_NF_IPTABLES && IP_NF_CONNTRACK
+	help
+	  The Full NAT option allows masquerading, port forwarding and other
+	  forms of full Network Address Port Translation.  It is controlled by
+	  the `nat' table in iptables: see the man page for iptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_NAT_NEEDED
+	bool
+	depends on IP_NF_NAT != n
+	default y
+
+config IP_NF_TARGET_MASQUERADE
+	tristate "MASQUERADE target support"
+	depends on IP_NF_NAT
+	help
+	  Masquerading is a special case of NAT: all outgoing connections are
+	  changed to seem to come from a particular interface's address, and
+	  if the interface goes down, those connections are lost.  This is
+	  only useful for dialup accounts with dynamic IP address (ie. your IP
+	  address will be different on next dialup).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_REDIRECT
+	tristate "REDIRECT target support"
+	depends on IP_NF_NAT
+	help
+	  REDIRECT is a special case of NAT: all incoming connections are
+	  mapped onto the incoming interface's address, causing the packets to
+	  come to the local machine instead of passing through.  This is
+	  useful for transparent proxies.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_NETMAP
+	tristate "NETMAP target support"
+	depends on IP_NF_NAT
+	help
+	  NETMAP is an implementation of static 1:1 NAT mapping of network
+	  addresses. It maps the network address part, while keeping the host
+	  address part intact. It is similar to Fast NAT, except that
+	  Netfilter's connection tracking doesn't work well with Fast NAT.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_SAME
+	tristate "SAME target support"
+	depends on IP_NF_NAT
+	help
+	  This option adds a `SAME' target, which works like the standard SNAT
+	  target, but attempts to give clients the same IP for all connections.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_NAT_SNMP_BASIC
+	tristate "Basic SNMP-ALG support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && IP_NF_NAT
+	---help---
+
+	  This module implements an Application Layer Gateway (ALG) for
+	  SNMP payloads.  In conjunction with NAT, it allows a network
+	  management system to access multiple private networks with
+	  conflicting addresses.  It works by modifying IP addresses
+	  inside SNMP payloads to match IP-layer NAT mapping.
+
+	  This is the "basic" form of SNMP-ALG, as described in RFC 2962
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_NAT_IRC
+	tristate
+	depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+	default IP_NF_NAT if IP_NF_IRC=y
+	default m if IP_NF_IRC=m
+
+# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y), 
+# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker.  Argh.
+config IP_NF_NAT_FTP
+	tristate
+	depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+	default IP_NF_NAT if IP_NF_FTP=y
+	default m if IP_NF_FTP=m
+
+config IP_NF_NAT_TFTP
+	tristate
+	depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+	default IP_NF_NAT if IP_NF_TFTP=y
+	default m if IP_NF_TFTP=m
+
+config IP_NF_NAT_AMANDA
+	tristate
+	depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n
+	default IP_NF_NAT if IP_NF_AMANDA=y
+	default m if IP_NF_AMANDA=m
+
+# mangle + specific targets
+config IP_NF_MANGLE
+	tristate "Packet mangling"
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a `mangle' table to iptables: see the man page for
+	  iptables(8).  This table is used for various packet alterations
+	  which can effect how the packet is routed.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_TOS
+	tristate "TOS target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `TOS' target, which allows you to create rules in
+	  the `mangle' table which alter the Type Of Service field of an IP
+	  packet prior to routing.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_ECN
+	tristate "ECN target support"
+	depends on IP_NF_MANGLE
+	---help---
+	  This option adds a `ECN' target, which can be used in the iptables mangle
+	  table.  
+
+	  You can use this target to remove the ECN bits from the IPv4 header of
+	  an IP packet.  This is particularly useful, if you need to work around
+	  existing ECN blackholes on the internet, but don't want to disable
+	  ECN support in general.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_DSCP
+	tristate "DSCP target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `DSCP' match, which allows you to match against
+	  the IPv4 header DSCP field (DSCP codepoint).
+
+	  The DSCP codepoint can have any value between 0x0 and 0x4f.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_MARK
+	tristate "MARK target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `MARK' target, which allows you to create rules
+	  in the `mangle' table which alter the netfilter mark (nfmark) field
+	  associated with the packet prior to routing. This can change
+	  the routing method (see `Use netfilter MARK value as routing
+	  key') and can also be used by other subsystems to change their
+	  behavior.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_CLASSIFY
+	tristate "CLASSIFY target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `CLASSIFY' target, which enables the user to set
+	  the priority of a packet. Some qdiscs can use this value for
+	  classification, among these are:
+
+  	  atm, cbq, dsmark, pfifo_fast, htb, prio
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_TARGET_CONNMARK
+	tristate  'CONNMARK target support'
+	depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
+	help
+	  This option adds a `CONNMARK' target, which allows one to manipulate
+	  the connection mark value.  Similar to the MARK target, but
+	  affects the connection mark value rather than the packet mark value.
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  The module will be called
+	  ipt_CONNMARK.o.  If unsure, say `N'.
+
+config IP_NF_TARGET_CLUSTERIP
+	tristate "CLUSTERIP target support (EXPERIMENTAL)"
+	depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL
+	help
+	  The CLUSTERIP target allows you to build load-balancing clusters of
+	  network servers without having a dedicated load-balancing
+	  router/server/switch.
+	
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+# raw + specific targets
+config IP_NF_RAW
+	tristate  'raw table support (required for NOTRACK/TRACE)'
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a `raw' table to iptables. This table is the very
+	  first in the netfilter framework and hooks in at the PREROUTING
+	  and OUTPUT chains.
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+config IP_NF_TARGET_NOTRACK
+	tristate  'NOTRACK target support'
+	depends on IP_NF_RAW
+	depends on IP_NF_CONNTRACK
+	help
+	  The NOTRACK target allows a select rule to specify
+	  which packets *not* to enter the conntrack/NAT
+	  subsystem with all the consequences (no ICMP error tracking,
+	  no protocol helpers for the selected packets).
+	
+	  If you want to compile it as a module, say M here and read
+	  <file:Documentation/modules.txt>.  If unsure, say `N'.
+
+
+# ARP tables
+config IP_NF_ARPTABLES
+	tristate "ARP tables support"
+	help
+	  arptables is a general, extensible packet identification framework.
+	  The ARP packet filtering and mangling (manipulation)subsystems
+	  use this: say Y or M here if you want to use either of those.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_ARPFILTER
+	tristate "ARP packet filtering"
+	depends on IP_NF_ARPTABLES
+	help
+	  ARP packet filtering defines a table `filter', which has a series of
+	  rules for simple ARP packet filtering at local input and
+	  local output.  On a bridge, you can also specify filtering rules
+	  for forwarded ARP packets. See the man page for arptables(8).
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
+config IP_NF_ARP_MANGLE
+	tristate "ARP payload mangling"
+	depends on IP_NF_ARPTABLES
+	help
+	  Allows altering the ARP packet payload: source and destination
+	  hardware and network addresses.
+
+endmenu
+
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 000000000000..45796d5924dd
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,89 @@
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+
+# objects for the standalone - connection tracking / NAT
+ip_conntrack-objs	:= ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
+iptable_nat-objs	:= ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+
+# connection tracking
+obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
+
+# SCTP protocol connection tracking
+obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
+
+# connection tracking helpers
+obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
+obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
+obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
+obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+
+# NAT helpers 
+obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
+obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
+obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
+obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+
+# generic IP tables 
+obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+
+# the three instances of ip_tables
+obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
+obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+
+# matches
+obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
+obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
+obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
+obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
+obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
+obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
+obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
+obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
+obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
+obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
+obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
+obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o
+obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o
+obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o
+obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o
+obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o
+obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o
+obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o
+obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o
+obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o
+obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
+obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
+obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
+obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
+
+# targets
+obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o
+obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o
+obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o
+obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o
+obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o
+obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o
+obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o
+obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o
+obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o
+obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o
+obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o
+obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o
+obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o
+obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o
+obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
+obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
+obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
+
+# generic ARP tables
+obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
+obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
+
+# just filtering instance of ARP tables for now
+obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
+
+obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
new file mode 100644
index 000000000000..df79f5ed6a0a
--- /dev/null
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -0,0 +1,1333 @@
+/*
+ * Packet matching code for ARP packets.
+ *
+ * Based heavily, if not almost entirely, upon ip_tables.c framework.
+ *
+ * Some ARP specific bits are:
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/if_arp.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+
+#include <linux/netfilter_arp/arp_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables core");
+
+/*#define DEBUG_ARP_TABLES*/
+/*#define DEBUG_ARP_TABLES_USER*/
+
+#ifdef DEBUG_ARP_TABLES
+#define dprintf(format, args...)  printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_ARP_TABLES_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define ARP_NF_ASSERT(x)					\
+do {								\
+	if (!(x))						\
+		printk("ARP_NF_ASSERT: %s:%s:%u\n",		\
+		       __FUNCTION__, __FILE__, __LINE__);	\
+} while(0)
+#else
+#define ARP_NF_ASSERT(x)
+#endif
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+static DECLARE_MUTEX(arpt_mutex);
+
+#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
+#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0)
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+struct arpt_table_info {
+	unsigned int size;
+	unsigned int number;
+	unsigned int initial_entries;
+	unsigned int hook_entry[NF_ARP_NUMHOOKS];
+	unsigned int underflow[NF_ARP_NUMHOOKS];
+	char entries[0] __attribute__((aligned(SMP_CACHE_BYTES)));
+};
+
+static LIST_HEAD(arpt_target);
+static LIST_HEAD(arpt_tables);
+#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+
+#ifdef CONFIG_SMP
+#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
+#else
+#define TABLE_OFFSET(t,p) 0
+#endif
+
+static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
+				      char *hdr_addr, int len)
+{
+	int i, ret;
+
+	if (len > ARPT_DEV_ADDR_LEN_MAX)
+		len = ARPT_DEV_ADDR_LEN_MAX;
+
+	ret = 0;
+	for (i = 0; i < len; i++)
+		ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
+
+	return (ret != 0);
+}
+
+/* Returns whether packet matches rule or not. */
+static inline int arp_packet_match(const struct arphdr *arphdr,
+				   struct net_device *dev,
+				   const char *indev,
+				   const char *outdev,
+				   const struct arpt_arp *arpinfo)
+{
+	char *arpptr = (char *)(arphdr + 1);
+	char *src_devaddr, *tgt_devaddr;
+	u32 src_ipaddr, tgt_ipaddr;
+	int i, ret;
+
+#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg))
+
+	if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
+		  ARPT_INV_ARPOP)) {
+		dprintf("ARP operation field mismatch.\n");
+		dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n",
+			arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask);
+		return 0;
+	}
+
+	if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd,
+		  ARPT_INV_ARPHRD)) {
+		dprintf("ARP hardware address format mismatch.\n");
+		dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n",
+			arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask);
+		return 0;
+	}
+
+	if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro,
+		  ARPT_INV_ARPPRO)) {
+		dprintf("ARP protocol address format mismatch.\n");
+		dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n",
+			arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask);
+		return 0;
+	}
+
+	if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln,
+		  ARPT_INV_ARPHLN)) {
+		dprintf("ARP hardware address length mismatch.\n");
+		dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n",
+			arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask);
+		return 0;
+	}
+
+	src_devaddr = arpptr;
+	arpptr += dev->addr_len;
+	memcpy(&src_ipaddr, arpptr, sizeof(u32));
+	arpptr += sizeof(u32);
+	tgt_devaddr = arpptr;
+	arpptr += dev->addr_len;
+	memcpy(&tgt_ipaddr, arpptr, sizeof(u32));
+
+	if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len),
+		  ARPT_INV_SRCDEVADDR) ||
+	    FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len),
+		  ARPT_INV_TGTDEVADDR)) {
+		dprintf("Source or target device address mismatch.\n");
+
+		return 0;
+	}
+
+	if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr,
+		  ARPT_INV_SRCIP) ||
+	    FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != arpinfo->tgt.s_addr),
+		  ARPT_INV_TGTIP)) {
+		dprintf("Source or target IP address mismatch.\n");
+
+		dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
+			NIPQUAD(src_ipaddr),
+			NIPQUAD(arpinfo->smsk.s_addr),
+			NIPQUAD(arpinfo->src.s_addr),
+			arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : "");
+		dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
+			NIPQUAD(tgt_ipaddr),
+			NIPQUAD(arpinfo->tmsk.s_addr),
+			NIPQUAD(arpinfo->tgt.s_addr),
+			arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : "");
+		return 0;
+	}
+
+	/* Look for ifname matches.  */
+	for (i = 0, ret = 0; i < IFNAMSIZ; i++) {
+		ret |= (indev[i] ^ arpinfo->iniface[i])
+			& arpinfo->iniface_mask[i];
+	}
+
+	if (FWINV(ret != 0, ARPT_INV_VIA_IN)) {
+		dprintf("VIA in mismatch (%s vs %s).%s\n",
+			indev, arpinfo->iniface,
+			arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":"");
+		return 0;
+	}
+
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+		unsigned long odev;
+		memcpy(&odev, outdev + i*sizeof(unsigned long),
+		       sizeof(unsigned long));
+		ret |= (odev
+			^ ((const unsigned long *)arpinfo->outiface)[i])
+			& ((const unsigned long *)arpinfo->outiface_mask)[i];
+	}
+
+	if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) {
+		dprintf("VIA out mismatch (%s vs %s).%s\n",
+			outdev, arpinfo->outiface,
+			arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":"");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline int arp_checkentry(const struct arpt_arp *arp)
+{
+	if (arp->flags & ~ARPT_F_MASK) {
+		duprintf("Unknown flag bits set: %08X\n",
+			 arp->flags & ~ARPT_F_MASK);
+		return 0;
+	}
+	if (arp->invflags & ~ARPT_INV_MASK) {
+		duprintf("Unknown invflag bits set: %08X\n",
+			 arp->invflags & ~ARPT_INV_MASK);
+		return 0;
+	}
+
+	return 1;
+}
+
+static unsigned int arpt_error(struct sk_buff **pskb,
+			       unsigned int hooknum,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       const void *targinfo,
+			       void *userinfo)
+{
+	if (net_ratelimit())
+		printk("arp_tables: error: '%s'\n", (char *)targinfo);
+
+	return NF_DROP;
+}
+
+static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
+{
+	return (struct arpt_entry *)(base + offset);
+}
+
+unsigned int arpt_do_table(struct sk_buff **pskb,
+			   unsigned int hook,
+			   const struct net_device *in,
+			   const struct net_device *out,
+			   struct arpt_table *table,
+			   void *userdata)
+{
+	static const char nulldevname[IFNAMSIZ];
+	unsigned int verdict = NF_DROP;
+	struct arphdr *arp;
+	int hotdrop = 0;
+	struct arpt_entry *e, *back;
+	const char *indev, *outdev;
+	void *table_base;
+
+	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
+	if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) +
+				     (2 * (*pskb)->dev->addr_len) +
+				     (2 * sizeof(u32)))))
+		return NF_DROP;
+
+	indev = in ? in->name : nulldevname;
+	outdev = out ? out->name : nulldevname;
+
+	read_lock_bh(&table->lock);
+	table_base = (void *)table->private->entries
+		+ TABLE_OFFSET(table->private,
+			       smp_processor_id());
+	e = get_entry(table_base, table->private->hook_entry[hook]);
+	back = get_entry(table_base, table->private->underflow[hook]);
+
+	arp = (*pskb)->nh.arph;
+	do {
+		if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) {
+			struct arpt_entry_target *t;
+			int hdr_len;
+
+			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
+				(2 * (*pskb)->dev->addr_len);
+			ADD_COUNTER(e->counters, hdr_len, 1);
+
+			t = arpt_get_target(e);
+
+			/* Standard target? */
+			if (!t->u.kernel.target->target) {
+				int v;
+
+				v = ((struct arpt_standard_target *)t)->verdict;
+				if (v < 0) {
+					/* Pop from stack? */
+					if (v != ARPT_RETURN) {
+						verdict = (unsigned)(-v) - 1;
+						break;
+					}
+					e = back;
+					back = get_entry(table_base,
+							 back->comefrom);
+					continue;
+				}
+				if (table_base + v
+				    != (void *)e + e->next_offset) {
+					/* Save old back ptr in next entry */
+					struct arpt_entry *next
+						= (void *)e + e->next_offset;
+					next->comefrom =
+						(void *)back - table_base;
+
+					/* set back pointer to next entry */
+					back = next;
+				}
+
+				e = get_entry(table_base, v);
+			} else {
+				/* Targets which reenter must return
+				 * abs. verdicts
+				 */
+				verdict = t->u.kernel.target->target(pskb,
+								     hook,
+								     in, out,
+								     t->data,
+								     userdata);
+
+				/* Target might have changed stuff. */
+				arp = (*pskb)->nh.arph;
+
+				if (verdict == ARPT_CONTINUE)
+					e = (void *)e + e->next_offset;
+				else
+					/* Verdict */
+					break;
+			}
+		} else {
+			e = (void *)e + e->next_offset;
+		}
+	} while (!hotdrop);
+	read_unlock_bh(&table->lock);
+
+	if (hotdrop)
+		return NF_DROP;
+	else
+		return verdict;
+}
+
+static inline void *find_inlist_lock_noload(struct list_head *head,
+					    const char *name,
+					    int *error,
+					    struct semaphore *mutex)
+{
+	void *ret;
+
+	*error = down_interruptible(mutex);
+	if (*error != 0)
+		return NULL;
+
+	ret = list_named_find(head, name);
+	if (!ret) {
+		*error = -ENOENT;
+		up(mutex);
+	}
+	return ret;
+}
+
+#ifndef CONFIG_KMOD
+#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m))
+#else
+static void *
+find_inlist_lock(struct list_head *head,
+		 const char *name,
+		 const char *prefix,
+		 int *error,
+		 struct semaphore *mutex)
+{
+	void *ret;
+
+	ret = find_inlist_lock_noload(head, name, error, mutex);
+	if (!ret) {
+		duprintf("find_inlist: loading `%s%s'.\n", prefix, name);
+		request_module("%s%s", prefix, name);
+		ret = find_inlist_lock_noload(head, name, error, mutex);
+	}
+
+	return ret;
+}
+#endif
+
+static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex)
+{
+	return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex);
+}
+
+static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex)
+{
+	return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex);
+}
+
+/* All zeroes == unconditional rule. */
+static inline int unconditional(const struct arpt_arp *arp)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++)
+		if (((__u32 *)arp)[i])
+			return 0;
+
+	return 1;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+ * there are loops.  Puts hook bitmask in comefrom.
+ */
+static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	 * to 0 as we leave), and comefrom to save source hook bitmask.
+	 */
+	for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct arpt_entry *e
+			= (struct arpt_entry *)(newinfo->entries + pos);
+
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+
+		for (;;) {
+			struct arpt_standard_target *t
+				= (void *)arpt_get_target(e);
+
+			if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
+				printk("arptables: loop hook %u pos %u %08X.\n",
+				       hook, pos, e->comefrom);
+				return 0;
+			}
+			e->comefrom
+				|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if (e->target_offset == sizeof(struct arpt_entry)
+			    && (strcmp(t->target.u.user.name,
+				       ARPT_STANDARD_TARGET) == 0)
+			    && t->verdict < 0
+			    && unconditional(&e->arp)) {
+				unsigned int oldpos, size;
+
+				/* Return: backtrack through the last
+				 * big jump.
+				 */
+				do {
+					e->comefrom ^= (1<<NF_ARP_NUMHOOKS);
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = (struct arpt_entry *)
+						(newinfo->entries + pos);
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = (struct arpt_entry *)
+					(newinfo->entries + pos + size);
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.user.name,
+					   ARPT_STANDARD_TARGET) == 0
+				    && newpos >= 0) {
+					/* This a jump; chase it. */
+					duprintf("Jump rule %u -> %u\n",
+						 pos, newpos);
+				} else {
+					/* ... this is a fallthru */
+					newpos = pos + e->next_offset;
+				}
+				e = (struct arpt_entry *)
+					(newinfo->entries + newpos);
+				e->counters.pcnt = pos;
+				pos = newpos;
+			}
+		}
+		next:
+		duprintf("Finished chain %u\n", hook);
+	}
+	return 1;
+}
+
+static inline int standard_check(const struct arpt_entry_target *t,
+				 unsigned int max_offset)
+{
+	struct arpt_standard_target *targ = (void *)t;
+
+	/* Check standard info. */
+	if (t->u.target_size
+	    != ARPT_ALIGN(sizeof(struct arpt_standard_target))) {
+		duprintf("arpt_standard_check: target size %u != %Zu\n",
+			 t->u.target_size,
+			 ARPT_ALIGN(sizeof(struct arpt_standard_target)));
+		return 0;
+	}
+
+	if (targ->verdict >= 0
+	    && targ->verdict > max_offset - sizeof(struct arpt_entry)) {
+		duprintf("arpt_standard_check: bad verdict (%i)\n",
+			 targ->verdict);
+		return 0;
+	}
+
+	if (targ->verdict < -NF_MAX_VERDICT - 1) {
+		duprintf("arpt_standard_check: bad negative verdict (%i)\n",
+			 targ->verdict);
+		return 0;
+	}
+	return 1;
+}
+
+static struct arpt_target arpt_standard_target;
+
+static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size,
+			      unsigned int *i)
+{
+	struct arpt_entry_target *t;
+	struct arpt_target *target;
+	int ret;
+
+	if (!arp_checkentry(&e->arp)) {
+		duprintf("arp_tables: arp check failed %p %s.\n", e, name);
+		return -EINVAL;
+	}
+
+	t = arpt_get_target(e);
+	target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex);
+	if (!target) {
+		duprintf("check_entry: `%s' not found\n", t->u.user.name);
+		goto out;
+	}
+	if (!try_module_get((target->me))) {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+	t->u.kernel.target = target;
+	up(&arpt_mutex);
+
+	if (t->u.kernel.target == &arpt_standard_target) {
+		if (!standard_check(t, size)) {
+			ret = -EINVAL;
+			goto out;
+		}
+	} else if (t->u.kernel.target->checkentry
+		   && !t->u.kernel.target->checkentry(name, e, t->data,
+						      t->u.target_size
+						      - sizeof(*t),
+						      e->comefrom)) {
+		module_put(t->u.kernel.target->me);
+		duprintf("arp_tables: check failed for `%s'.\n",
+			 t->u.kernel.target->name);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	(*i)++;
+	return 0;
+
+out_unlock:
+	up(&arpt_mutex);
+out:
+	return ret;
+}
+
+static inline int check_entry_size_and_hooks(struct arpt_entry *e,
+					     struct arpt_table_info *newinfo,
+					     unsigned char *base,
+					     unsigned char *limit,
+					     const unsigned int *hook_entries,
+					     const unsigned int *underflows,
+					     unsigned int *i)
+{
+	unsigned int h;
+
+	if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
+	    || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+		duprintf("Bad offset %p\n", e);
+		return -EINVAL;
+	}
+
+	if (e->next_offset
+	    < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h])
+			newinfo->underflow[h] = underflows[h];
+	}
+
+	/* FIXME: underflows must be unconditional, standard verdicts
+           < 0 (not ARPT_RETURN). --RR */
+
+	/* Clear counters and comefrom */
+	e->counters = ((struct arpt_counters) { 0, 0 });
+	e->comefrom = 0;
+
+	(*i)++;
+	return 0;
+}
+
+static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
+{
+	struct arpt_entry_target *t;
+
+	if (i && (*i)-- == 0)
+		return 1;
+
+	t = arpt_get_target(e);
+	if (t->u.kernel.target->destroy)
+		t->u.kernel.target->destroy(t->data,
+					    t->u.target_size - sizeof(*t));
+	module_put(t->u.kernel.target->me);
+	return 0;
+}
+
+/* Checks and translates the user-supplied table segment (held in
+ * newinfo).
+ */
+static int translate_table(const char *name,
+			   unsigned int valid_hooks,
+			   struct arpt_table_info *newinfo,
+			   unsigned int size,
+			   unsigned int number,
+			   const unsigned int *hook_entries,
+			   const unsigned int *underflows)
+{
+	unsigned int i;
+	int ret;
+
+	newinfo->size = size;
+	newinfo->number = number;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = 0xFFFFFFFF;
+		newinfo->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_table: size %u\n", newinfo->size);
+	i = 0;
+
+	/* Walk through entries, checking offsets. */
+	ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				 check_entry_size_and_hooks,
+				 newinfo,
+				 newinfo->entries,
+				 newinfo->entries + size,
+				 hook_entries, underflows, &i);
+	duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
+	if (ret != 0)
+		return ret;
+
+	if (i != number) {
+		duprintf("translate_table: %u not %u entries\n",
+			 i, number);
+		return -EINVAL;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(valid_hooks & (1 << i)))
+			continue;
+		if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, hook_entries[i]);
+			return -EINVAL;
+		}
+		if (newinfo->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, underflows[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (!mark_source_chains(newinfo, valid_hooks)) {
+		duprintf("Looping hook\n");
+		return -ELOOP;
+	}
+
+	/* Finally, each sanity check must pass */
+	i = 0;
+	ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				 check_entry, name, size, &i);
+
+	if (ret != 0) {
+		ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				   cleanup_entry, &i);
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for (i = 1; i < num_possible_cpus(); i++) {
+		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+		       newinfo->entries,
+		       SMP_ALIGN(newinfo->size));
+	}
+
+	return ret;
+}
+
+static struct arpt_table_info *replace_table(struct arpt_table *table,
+					     unsigned int num_counters,
+					     struct arpt_table_info *newinfo,
+					     int *error)
+{
+	struct arpt_table_info *oldinfo;
+
+	/* Do the substitution. */
+	write_lock_bh(&table->lock);
+	/* Check inside lock: is the old number correct? */
+	if (num_counters != table->private->number) {
+		duprintf("num_counters != table->private->number (%u/%u)\n",
+			 num_counters, table->private->number);
+		write_unlock_bh(&table->lock);
+		*error = -EAGAIN;
+		return NULL;
+	}
+	oldinfo = table->private;
+	table->private = newinfo;
+	newinfo->initial_entries = oldinfo->initial_entries;
+	write_unlock_bh(&table->lock);
+
+	return oldinfo;
+}
+
+/* Gets counters. */
+static inline int add_entry_to_counter(const struct arpt_entry *e,
+				       struct arpt_counters total[],
+				       unsigned int *i)
+{
+	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+static void get_counters(const struct arpt_table_info *t,
+			 struct arpt_counters counters[])
+{
+	unsigned int cpu;
+	unsigned int i;
+
+	for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+		i = 0;
+		ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+				   t->size,
+				   add_entry_to_counter,
+				   counters,
+				   &i);
+	}
+}
+
+static int copy_entries_to_user(unsigned int total_size,
+				struct arpt_table *table,
+				void __user *userptr)
+{
+	unsigned int off, num, countersize;
+	struct arpt_entry *e;
+	struct arpt_counters *counters;
+	int ret = 0;
+
+	/* We need atomic snapshot of counters: rest doesn't change
+	 * (other than comefrom, which userspace doesn't care
+	 * about).
+	 */
+	countersize = sizeof(struct arpt_counters) * table->private->number;
+	counters = vmalloc(countersize);
+
+	if (counters == NULL)
+		return -ENOMEM;
+
+	/* First, sum counters... */
+	memset(counters, 0, countersize);
+	write_lock_bh(&table->lock);
+	get_counters(table->private, counters);
+	write_unlock_bh(&table->lock);
+
+	/* ... then copy entire thing from CPU 0... */
+	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+		ret = -EFAULT;
+		goto free_counters;
+	}
+
+	/* FIXME: use iterator macros --RR */
+	/* ... then go back and fix counters and names */
+	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+		struct arpt_entry_target *t;
+
+		e = (struct arpt_entry *)(table->private->entries + off);
+		if (copy_to_user(userptr + off
+				 + offsetof(struct arpt_entry, counters),
+				 &counters[num],
+				 sizeof(counters[num])) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+
+		t = arpt_get_target(e);
+		if (copy_to_user(userptr + off + e->target_offset
+				 + offsetof(struct arpt_entry_target,
+					    u.user.name),
+				 t->u.kernel.target->name,
+				 strlen(t->u.kernel.target->name)+1) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+	}
+
+ free_counters:
+	vfree(counters);
+	return ret;
+}
+
+static int get_entries(const struct arpt_get_entries *entries,
+		       struct arpt_get_entries __user *uptr)
+{
+	int ret;
+	struct arpt_table *t;
+
+	t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex);
+	if (t) {
+		duprintf("t->private->number = %u\n",
+			 t->private->number);
+		if (entries->size == t->private->size)
+			ret = copy_entries_to_user(t->private->size,
+						   t, uptr->entrytable);
+		else {
+			duprintf("get_entries: I've got %u not %u!\n",
+				 t->private->size,
+				 entries->size);
+			ret = -EINVAL;
+		}
+		up(&arpt_mutex);
+	} else
+		duprintf("get_entries: Can't find %s!\n",
+			 entries->name);
+
+	return ret;
+}
+
+static int do_replace(void __user *user, unsigned int len)
+{
+	int ret;
+	struct arpt_replace tmp;
+	struct arpt_table *t;
+	struct arpt_table_info *newinfo, *oldinfo;
+	struct arpt_counters *counters;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	/* Hack: Causes ipchains to give correct error msg --RR */
+	if (len != sizeof(tmp) + tmp.size)
+		return -ENOPROTOOPT;
+
+	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
+	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
+		return -ENOMEM;
+
+	newinfo = vmalloc(sizeof(struct arpt_table_info)
+			  + SMP_ALIGN(tmp.size) * num_possible_cpus());
+	if (!newinfo)
+		return -ENOMEM;
+
+	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+			   tmp.size) != 0) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters));
+	if (!counters) {
+		ret = -ENOMEM;
+		goto free_newinfo;
+	}
+	memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters));
+
+	ret = translate_table(tmp.name, tmp.valid_hooks,
+			      newinfo, tmp.size, tmp.num_entries,
+			      tmp.hook_entry, tmp.underflow);
+	if (ret != 0)
+		goto free_newinfo_counters;
+
+	duprintf("arp_tables: Translated table\n");
+
+	t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
+	if (!t)
+		goto free_newinfo_counters_untrans;
+
+	/* You lied! */
+	if (tmp.valid_hooks != t->valid_hooks) {
+		duprintf("Valid hook crap: %08X vs %08X\n",
+			 tmp.valid_hooks, t->valid_hooks);
+		ret = -EINVAL;
+		goto free_newinfo_counters_untrans_unlock;
+	}
+
+	/* Get a reference in advance, we're not allowed fail later */
+	if (!try_module_get(t->me)) {
+		ret = -EBUSY;
+		goto free_newinfo_counters_untrans_unlock;
+	}
+
+	oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+	if (!oldinfo)
+		goto put_module;
+
+	/* Update module usage count based on number of rules */
+	duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+		oldinfo->number, oldinfo->initial_entries, newinfo->number);
+	if ((oldinfo->number > oldinfo->initial_entries) || 
+	    (newinfo->number <= oldinfo->initial_entries)) 
+		module_put(t->me);
+	if ((oldinfo->number > oldinfo->initial_entries) &&
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+
+	/* Get the old counters. */
+	get_counters(oldinfo, counters);
+	/* Decrease module usage counts and free resource */
+	ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
+	vfree(oldinfo);
+	if (copy_to_user(tmp.counters, counters,
+			 sizeof(struct arpt_counters) * tmp.num_counters) != 0)
+		ret = -EFAULT;
+	vfree(counters);
+	up(&arpt_mutex);
+	return ret;
+
+ put_module:
+	module_put(t->me);
+ free_newinfo_counters_untrans_unlock:
+	up(&arpt_mutex);
+ free_newinfo_counters_untrans:
+	ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL);
+ free_newinfo_counters:
+	vfree(counters);
+ free_newinfo:
+	vfree(newinfo);
+	return ret;
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK.
+ */
+static inline int add_counter_to_entry(struct arpt_entry *e,
+				       const struct arpt_counters addme[],
+				       unsigned int *i)
+{
+
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+static int do_add_counters(void __user *user, unsigned int len)
+{
+	unsigned int i;
+	struct arpt_counters_info tmp, *paddc;
+	struct arpt_table *t;
+	int ret;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters))
+		return -EINVAL;
+
+	paddc = vmalloc(len);
+	if (!paddc)
+		return -ENOMEM;
+
+	if (copy_from_user(paddc, user, len) != 0) {
+		ret = -EFAULT;
+		goto free;
+	}
+
+	t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex);
+	if (!t)
+		goto free;
+
+	write_lock_bh(&t->lock);
+	if (t->private->number != paddc->num_counters) {
+		ret = -EINVAL;
+		goto unlock_up_free;
+	}
+
+	i = 0;
+	ARPT_ENTRY_ITERATE(t->private->entries,
+			   t->private->size,
+			   add_counter_to_entry,
+			   paddc->counters,
+			   &i);
+ unlock_up_free:
+	write_unlock_bh(&t->lock);
+	up(&arpt_mutex);
+ free:
+	vfree(paddc);
+
+	return ret;
+}
+
+static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
+{
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case ARPT_SO_SET_REPLACE:
+		ret = do_replace(user, len);
+		break;
+
+	case ARPT_SO_SET_ADD_COUNTERS:
+		ret = do_add_counters(user, len);
+		break;
+
+	default:
+		duprintf("do_arpt_set_ctl:  unknown request %i\n", cmd);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case ARPT_SO_GET_INFO: {
+		char name[ARPT_TABLE_MAXNAMELEN];
+		struct arpt_table *t;
+
+		if (*len != sizeof(struct arpt_getinfo)) {
+			duprintf("length %u != %Zu\n", *len,
+				 sizeof(struct arpt_getinfo));
+			ret = -EINVAL;
+			break;
+		}
+
+		if (copy_from_user(name, user, sizeof(name)) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+		name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+		t = arpt_find_table_lock(name, &ret, &arpt_mutex);
+		if (t) {
+			struct arpt_getinfo info;
+
+			info.valid_hooks = t->valid_hooks;
+			memcpy(info.hook_entry, t->private->hook_entry,
+			       sizeof(info.hook_entry));
+			memcpy(info.underflow, t->private->underflow,
+			       sizeof(info.underflow));
+			info.num_entries = t->private->number;
+			info.size = t->private->size;
+			strcpy(info.name, name);
+
+			if (copy_to_user(user, &info, *len) != 0)
+				ret = -EFAULT;
+			else
+				ret = 0;
+
+			up(&arpt_mutex);
+		}
+	}
+	break;
+
+	case ARPT_SO_GET_ENTRIES: {
+		struct arpt_get_entries get;
+
+		if (*len < sizeof(get)) {
+			duprintf("get_entries: %u < %Zu\n", *len, sizeof(get));
+			ret = -EINVAL;
+		} else if (copy_from_user(&get, user, sizeof(get)) != 0) {
+			ret = -EFAULT;
+		} else if (*len != sizeof(struct arpt_get_entries) + get.size) {
+			duprintf("get_entries: %u != %Zu\n", *len,
+				 sizeof(struct arpt_get_entries) + get.size);
+			ret = -EINVAL;
+		} else
+			ret = get_entries(&get, user);
+		break;
+	}
+
+	default:
+		duprintf("do_arpt_get_ctl: unknown request %i\n", cmd);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/* Registration hooks for targets. */
+int arpt_register_target(struct arpt_target *target)
+{
+	int ret;
+
+	ret = down_interruptible(&arpt_mutex);
+	if (ret != 0)
+		return ret;
+
+	if (!list_named_insert(&arpt_target, target)) {
+		duprintf("arpt_register_target: `%s' already in list!\n",
+			 target->name);
+		ret = -EINVAL;
+	}
+	up(&arpt_mutex);
+	return ret;
+}
+
+void arpt_unregister_target(struct arpt_target *target)
+{
+	down(&arpt_mutex);
+	LIST_DELETE(&arpt_target, target);
+	up(&arpt_mutex);
+}
+
+int arpt_register_table(struct arpt_table *table,
+			const struct arpt_replace *repl)
+{
+	int ret;
+	struct arpt_table_info *newinfo;
+	static struct arpt_table_info bootstrap
+		= { 0, 0, 0, { 0 }, { 0 }, { } };
+
+	newinfo = vmalloc(sizeof(struct arpt_table_info)
+			  + SMP_ALIGN(repl->size) * num_possible_cpus());
+	if (!newinfo) {
+		ret = -ENOMEM;
+		return ret;
+	}
+	memcpy(newinfo->entries, repl->entries, repl->size);
+
+	ret = translate_table(table->name, table->valid_hooks,
+			      newinfo, repl->size,
+			      repl->num_entries,
+			      repl->hook_entry,
+			      repl->underflow);
+	duprintf("arpt_register_table: translate table gives %d\n", ret);
+	if (ret != 0) {
+		vfree(newinfo);
+		return ret;
+	}
+
+	ret = down_interruptible(&arpt_mutex);
+	if (ret != 0) {
+		vfree(newinfo);
+		return ret;
+	}
+
+	/* Don't autoload: we'd eat our tail... */
+	if (list_named_find(&arpt_tables, table->name)) {
+		ret = -EEXIST;
+		goto free_unlock;
+	}
+
+	/* Simplifies replace_table code. */
+	table->private = &bootstrap;
+	if (!replace_table(table, 0, newinfo, &ret))
+		goto free_unlock;
+
+	duprintf("table->private->number = %u\n",
+		 table->private->number);
+	
+	/* save number of initial entries */
+	table->private->initial_entries = table->private->number;
+
+	rwlock_init(&table->lock);
+	list_prepend(&arpt_tables, table);
+
+ unlock:
+	up(&arpt_mutex);
+	return ret;
+
+ free_unlock:
+	vfree(newinfo);
+	goto unlock;
+}
+
+void arpt_unregister_table(struct arpt_table *table)
+{
+	down(&arpt_mutex);
+	LIST_DELETE(&arpt_tables, table);
+	up(&arpt_mutex);
+
+	/* Decrease module usage counts and free resources */
+	ARPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+			   cleanup_entry, NULL);
+	vfree(table->private);
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct arpt_target arpt_standard_target = {
+	.name		= ARPT_STANDARD_TARGET,
+};
+
+static struct arpt_target arpt_error_target = {
+	.name		= ARPT_ERROR_TARGET,
+	.target		= arpt_error,
+};
+
+static struct nf_sockopt_ops arpt_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= ARPT_BASE_CTL,
+	.set_optmax	= ARPT_SO_SET_MAX+1,
+	.set		= do_arpt_set_ctl,
+	.get_optmin	= ARPT_BASE_CTL,
+	.get_optmax	= ARPT_SO_GET_MAX+1,
+	.get		= do_arpt_get_ctl,
+};
+
+#ifdef CONFIG_PROC_FS
+static inline int print_name(const struct arpt_table *t,
+			     off_t start_offset, char *buffer, int length,
+			     off_t *pos, unsigned int *count)
+{
+	if ((*count)++ >= start_offset) {
+		unsigned int namelen;
+
+		namelen = sprintf(buffer + *pos, "%s\n", t->name);
+		if (*pos + namelen > length) {
+			/* Stop iterating */
+			return 1;
+		}
+		*pos += namelen;
+	}
+	return 0;
+}
+
+static int arpt_get_tables(char *buffer, char **start, off_t offset, int length)
+{
+	off_t pos = 0;
+	unsigned int count = 0;
+
+	if (down_interruptible(&arpt_mutex) != 0)
+		return 0;
+
+	LIST_FIND(&arpt_tables, print_name, struct arpt_table *,
+		  offset, buffer, length, &pos, &count);
+
+	up(&arpt_mutex);
+
+	/* `start' hack - see fs/proc/generic.c line ~105 */
+	*start=(char *)((unsigned long)count-offset);
+	return pos;
+}
+#endif /*CONFIG_PROC_FS*/
+
+static int __init init(void)
+{
+	int ret;
+
+	/* Noone else will be downing sem now, so we won't sleep */
+	down(&arpt_mutex);
+	list_append(&arpt_target, &arpt_standard_target);
+	list_append(&arpt_target, &arpt_error_target);
+	up(&arpt_mutex);
+
+	/* Register setsockopt */
+	ret = nf_register_sockopt(&arpt_sockopts);
+	if (ret < 0) {
+		duprintf("Unable to register sockopts.\n");
+		return ret;
+	}
+
+#ifdef CONFIG_PROC_FS
+	{
+		struct proc_dir_entry *proc;
+
+		proc = proc_net_create("arp_tables_names", 0, arpt_get_tables);
+		if (!proc) {
+			nf_unregister_sockopt(&arpt_sockopts);
+			return -ENOMEM;
+		}
+		proc->owner = THIS_MODULE;
+	}
+#endif
+
+	printk("arp_tables: (C) 2002 David S. Miller\n");
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	nf_unregister_sockopt(&arpt_sockopts);
+#ifdef CONFIG_PROC_FS
+	proc_net_remove("arp_tables_names");
+#endif
+}
+
+EXPORT_SYMBOL(arpt_register_table);
+EXPORT_SYMBOL(arpt_unregister_table);
+EXPORT_SYMBOL(arpt_do_table);
+EXPORT_SYMBOL(arpt_register_target);
+EXPORT_SYMBOL(arpt_unregister_target);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
new file mode 100644
index 000000000000..3e592ec86482
--- /dev/null
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -0,0 +1,104 @@
+/* module that allows mangling of the arp payload */
+#include <linux/module.h>
+#include <linux/netfilter_arp/arpt_mangle.h>
+#include <net/sock.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("arptables arp payload mangle target");
+
+static unsigned int
+target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in,
+   const struct net_device *out, const void *targinfo, void *userinfo)
+{
+	const struct arpt_mangle *mangle = targinfo;
+	struct arphdr *arp;
+	unsigned char *arpptr;
+	int pln, hln;
+
+	if (skb_shared(*pskb) || skb_cloned(*pskb)) {
+		struct sk_buff *nskb;
+
+		nskb = skb_copy(*pskb, GFP_ATOMIC);
+		if (!nskb)
+			return NF_DROP;
+		if ((*pskb)->sk)
+			skb_set_owner_w(nskb, (*pskb)->sk);
+		kfree_skb(*pskb);
+		*pskb = nskb;
+	}
+
+	arp = (*pskb)->nh.arph;
+	arpptr = (*pskb)->nh.raw + sizeof(*arp);
+	pln = arp->ar_pln;
+	hln = arp->ar_hln;
+	/* We assume that pln and hln were checked in the match */
+	if (mangle->flags & ARPT_MANGLE_SDEV) {
+		if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+		   (arpptr + hln > (**pskb).tail))
+			return NF_DROP;
+		memcpy(arpptr, mangle->src_devaddr, hln);
+	}
+	arpptr += hln;
+	if (mangle->flags & ARPT_MANGLE_SIP) {
+		if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+		   (arpptr + pln > (**pskb).tail))
+			return NF_DROP;
+		memcpy(arpptr, &mangle->u_s.src_ip, pln);
+	}
+	arpptr += pln;
+	if (mangle->flags & ARPT_MANGLE_TDEV) {
+		if (ARPT_DEV_ADDR_LEN_MAX < hln ||
+		   (arpptr + hln > (**pskb).tail))
+			return NF_DROP;
+		memcpy(arpptr, mangle->tgt_devaddr, hln);
+	}
+	arpptr += hln;
+	if (mangle->flags & ARPT_MANGLE_TIP) {
+		if (ARPT_MANGLE_ADDR_LEN_MAX < pln ||
+		   (arpptr + pln > (**pskb).tail))
+			return NF_DROP;
+		memcpy(arpptr, &mangle->u_t.tgt_ip, pln);
+	}
+	return mangle->target;
+}
+
+static int
+checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo,
+   unsigned int targinfosize, unsigned int hook_mask)
+{
+	const struct arpt_mangle *mangle = targinfo;
+
+	if (mangle->flags & ~ARPT_MANGLE_MASK ||
+	    !(mangle->flags & ARPT_MANGLE_MASK))
+		return 0;
+
+	if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
+	   mangle->target != ARPT_CONTINUE)
+		return 0;
+	return 1;
+}
+
+static struct arpt_target arpt_mangle_reg
+= {
+        .name		= "mangle",
+        .target		= target,
+        .checkentry	= checkentry,
+        .me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	if (arpt_register_target(&arpt_mangle_reg))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	arpt_unregister_target(&arpt_mangle_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
new file mode 100644
index 000000000000..0d759f5a4ef0
--- /dev/null
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -0,0 +1,214 @@
+/*
+ * Filtering ARP tables module.
+ *
+ * Copyright (C) 2002 David S. Miller (davem@redhat.com)
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_arp/arp_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
+MODULE_DESCRIPTION("arptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
+			   (1 << NF_ARP_FORWARD))
+
+/* Standard entry. */
+struct arpt_standard
+{
+	struct arpt_entry entry;
+	struct arpt_standard_target target;
+};
+
+struct arpt_error_target
+{
+	struct arpt_entry_target target;
+	char errorname[ARPT_FUNCTION_MAXNAMELEN];
+};
+
+struct arpt_error
+{
+	struct arpt_entry entry;
+	struct arpt_error_target target;
+};
+
+static struct
+{
+	struct arpt_replace repl;
+	struct arpt_standard entries[3];
+	struct arpt_error term;
+} initial_table __initdata
+= { { "filter", FILTER_VALID_HOOKS, 4,
+      sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
+      { [NF_ARP_IN] = 0,
+	[NF_ARP_OUT] = sizeof(struct arpt_standard),
+	[NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
+      { [NF_ARP_IN] = 0,
+	[NF_ARP_OUT] = sizeof(struct arpt_standard),
+	[NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), },
+      0, NULL, { } },
+    {
+	    /* ARP_IN */
+	    {
+		    {
+			    {
+				    { 0 }, { 0 }, { 0 }, { 0 },
+				    0, 0,
+				    { { 0, }, { 0, } },
+				    { { 0, }, { 0, } },
+				    0, 0,
+				    0, 0,
+				    0, 0,
+				    "", "", { 0 }, { 0 },
+				    0, 0
+			    },
+			    sizeof(struct arpt_entry),
+			    sizeof(struct arpt_standard),
+			    0,
+			    { 0, 0 }, { } },
+		    { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
+		      -NF_ACCEPT - 1 }
+	    },
+	    /* ARP_OUT */
+	    {
+		    {
+			    {
+				    { 0 }, { 0 }, { 0 }, { 0 },
+				    0, 0,
+				    { { 0, }, { 0, } },
+				    { { 0, }, { 0, } },
+				    0, 0,
+				    0, 0,
+				    0, 0,
+				    "", "", { 0 }, { 0 },
+				    0, 0
+			    },
+			    sizeof(struct arpt_entry),
+			    sizeof(struct arpt_standard),
+			    0,
+			    { 0, 0 }, { } },
+		    { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
+		      -NF_ACCEPT - 1 }
+	    },
+	    /* ARP_FORWARD */
+	    {
+		    {
+			    {
+				    { 0 }, { 0 }, { 0 }, { 0 },
+				    0, 0,
+				    { { 0, }, { 0, } },
+				    { { 0, }, { 0, } },
+				    0, 0,
+				    0, 0,
+				    0, 0,
+				    "", "", { 0 }, { 0 },
+				    0, 0
+			    },
+			    sizeof(struct arpt_entry),
+			    sizeof(struct arpt_standard),
+			    0,
+			    { 0, 0 }, { } },
+		    { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } },
+		      -NF_ACCEPT - 1 }
+	    }
+    },
+    /* ERROR */
+    {
+	    {
+		    {
+			    { 0 }, { 0 }, { 0 }, { 0 },
+			    0, 0,
+			    { { 0, }, { 0, } },
+			    { { 0, }, { 0, } },
+			    0, 0,
+			    0, 0,
+			    0, 0,
+			    "", "", { 0 }, { 0 },
+			    0, 0
+		    },
+		    sizeof(struct arpt_entry),
+		    sizeof(struct arpt_error),
+		    0,
+		    { 0, 0 }, { } },
+	    { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } },
+		{ } },
+	      "ERROR"
+	    }
+    }
+};
+
+static struct arpt_table packet_filter = {
+	.name		= "filter",
+	.valid_hooks	= FILTER_VALID_HOOKS,
+	.lock		= RW_LOCK_UNLOCKED,
+	.private	= NULL,
+	.me		= THIS_MODULE,
+};
+
+/* The work comes in here from netfilter.c */
+static unsigned int arpt_hook(unsigned int hook,
+			      struct sk_buff **pskb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+static struct nf_hook_ops arpt_ops[] = {
+	{
+		.hook		= arpt_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NF_ARP,
+		.hooknum	= NF_ARP_IN,
+	},
+	{
+		.hook		= arpt_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NF_ARP,
+		.hooknum	= NF_ARP_OUT,
+	},
+	{
+		.hook		= arpt_hook,
+		.owner		= THIS_MODULE,
+		.pf		= NF_ARP,
+		.hooknum	= NF_ARP_FORWARD,
+	},
+};
+
+static int __init init(void)
+{
+	int ret, i;
+
+	/* Register table */
+	ret = arpt_register_table(&packet_filter, &initial_table.repl);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < ARRAY_SIZE(arpt_ops); i++)
+		if ((ret = nf_register_hook(&arpt_ops[i])) < 0)
+			goto cleanup_hooks;
+	return ret;
+
+cleanup_hooks:
+	while (--i >= 0)
+		nf_unregister_hook(&arpt_ops[i]);
+
+	arpt_unregister_table(&packet_filter);
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(arpt_ops); i++)
+		nf_unregister_hook(&arpt_ops[i]);
+
+	arpt_unregister_table(&packet_filter);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c
new file mode 100644
index 000000000000..3dbddd062605
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_amanda.c
@@ -0,0 +1,167 @@
+/* Amanda extension for IP connection tracking, Version 0.2
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on HW's ip_conntrack_irc.c as well as other modules
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ *	Module load syntax:
+ * 	insmod ip_conntrack_amanda.o [master_timeout=n]
+ *	
+ *	Where master_timeout is the timeout (in seconds) of the master
+ *	connection (port 10080).  This defaults to 5 minutes but if
+ *	your clients take longer than 5 minutes to do their work
+ *	before getting back to the Amanda server, you can increase
+ *	this value.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/moduleparam.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
+
+static unsigned int master_timeout = 300;
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda connection tracking module");
+MODULE_LICENSE("GPL");
+module_param(master_timeout, int, 0600);
+MODULE_PARM_DESC(master_timeout, "timeout for the master connection");
+
+static char *conns[] = { "DATA ", "MESG ", "INDEX " };
+
+/* This is slow, but it's simple. --RR */
+static char amanda_buffer[65536];
+static DECLARE_LOCK(amanda_buffer_lock);
+
+unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb,
+				   enum ip_conntrack_info ctinfo,
+				   unsigned int matchoff,
+				   unsigned int matchlen,
+				   struct ip_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(ip_nat_amanda_hook);
+
+static int help(struct sk_buff **pskb,
+                struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+{
+	struct ip_conntrack_expect *exp;
+	char *data, *data_limit, *tmp;
+	unsigned int dataoff, i;
+	u_int16_t port, len;
+	int ret = NF_ACCEPT;
+
+	/* Only look at packets from the Amanda server */
+	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	/* increase the UDP timeout of the master connection as replies from
+	 * Amanda clients to the server can be quite delayed */
+	ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ);
+
+	/* No data? */
+	dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr);
+	if (dataoff >= (*pskb)->len) {
+		if (net_ratelimit())
+			printk("amanda_help: skblen = %u\n", (*pskb)->len);
+		return NF_ACCEPT;
+	}
+
+	LOCK_BH(&amanda_buffer_lock);
+	skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff);
+	data = amanda_buffer;
+	data_limit = amanda_buffer + (*pskb)->len - dataoff;
+	*data_limit = '\0';
+
+	/* Search for the CONNECT string */
+	data = strstr(data, "CONNECT ");
+	if (!data)
+		goto out;
+	data += strlen("CONNECT ");
+
+	/* Only search first line. */	
+	if ((tmp = strchr(data, '\n')))
+		*tmp = '\0';
+
+	for (i = 0; i < ARRAY_SIZE(conns); i++) {
+		char *match = strstr(data, conns[i]);
+		if (!match)
+			continue;
+		tmp = data = match + strlen(conns[i]);
+		port = simple_strtoul(data, &data, 10);
+		len = data - tmp;
+		if (port == 0 || len > 5)
+			break;
+
+		exp = ip_conntrack_expect_alloc();
+		if (exp == NULL) {
+			ret = NF_DROP;
+			goto out;
+		}
+
+		exp->expectfn = NULL;
+		exp->master = ct;
+
+		exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip;
+		exp->tuple.src.u.tcp.port = 0;
+		exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
+		exp->tuple.dst.protonum = IPPROTO_TCP;
+		exp->tuple.dst.u.tcp.port = htons(port);
+
+		exp->mask.src.ip = 0xFFFFFFFF;
+		exp->mask.src.u.tcp.port = 0;
+		exp->mask.dst.ip = 0xFFFFFFFF;
+		exp->mask.dst.protonum = 0xFF;
+		exp->mask.dst.u.tcp.port = 0xFFFF;
+
+		if (ip_nat_amanda_hook)
+			ret = ip_nat_amanda_hook(pskb, ctinfo,
+						 tmp - amanda_buffer,
+						 len, exp);
+		else if (ip_conntrack_expect_related(exp) != 0) {
+			ip_conntrack_expect_free(exp);
+			ret = NF_DROP;
+		}
+	}
+
+out:
+	UNLOCK_BH(&amanda_buffer_lock);
+	return ret;
+}
+
+static struct ip_conntrack_helper amanda_helper = {
+	.max_expected = ARRAY_SIZE(conns),
+	.timeout = 180,
+	.me = THIS_MODULE,
+	.help = help,
+	.name = "amanda",
+
+	.tuple = { .src = { .u = { __constant_htons(10080) } },
+		   .dst = { .protonum = IPPROTO_UDP },
+	},
+	.mask = { .src = { .u = { 0xFFFF } },
+		 .dst = { .protonum = 0xFF },
+	},
+};
+
+static void __exit fini(void)
+{
+	ip_conntrack_helper_unregister(&amanda_helper);
+}
+
+static int __init init(void)
+{
+	return ip_conntrack_helper_register(&amanda_helper);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
new file mode 100644
index 000000000000..28d9425d5c39
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -0,0 +1,1247 @@
+/* Connection state tracking for netfilter.  This is separated from,
+   but required by, the NAT layer; it can also be used by an iptables
+   extension. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell  
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
+ * 	- new API and handling of conntrack/nat helpers
+ * 	- now capable of multiple expectations for one master
+ * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
+ * 	- add usage/reference counts to ip_conntrack_expect
+ *	- export ip_conntrack[_expect]_{find_get,put} functions
+ * */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/vmalloc.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <linux/stddef.h>
+#include <linux/sysctl.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/moduleparam.h>
+
+/* This rwlock protects the main hash table, protocol/helper/expected
+   registrations, conntrack timers*/
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#define IP_CONNTRACK_VERSION	"2.1"
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_conntrack_lock);
+
+/* ip_conntrack_standalone needs this */
+atomic_t ip_conntrack_count = ATOMIC_INIT(0);
+
+void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
+LIST_HEAD(ip_conntrack_expect_list);
+struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
+static LIST_HEAD(helpers);
+unsigned int ip_conntrack_htable_size = 0;
+int ip_conntrack_max;
+struct list_head *ip_conntrack_hash;
+static kmem_cache_t *ip_conntrack_cachep;
+static kmem_cache_t *ip_conntrack_expect_cachep;
+struct ip_conntrack ip_conntrack_untracked;
+unsigned int ip_ct_log_invalid;
+static LIST_HEAD(unconfirmed);
+static int ip_conntrack_vmalloc;
+
+DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+
+void 
+ip_conntrack_put(struct ip_conntrack *ct)
+{
+	IP_NF_ASSERT(ct);
+	nf_conntrack_put(&ct->ct_general);
+}
+
+static int ip_conntrack_hash_rnd_initted;
+static unsigned int ip_conntrack_hash_rnd;
+
+static u_int32_t
+hash_conntrack(const struct ip_conntrack_tuple *tuple)
+{
+#if 0
+	dump_tuple(tuple);
+#endif
+	return (jhash_3words(tuple->src.ip,
+	                     (tuple->dst.ip ^ tuple->dst.protonum),
+	                     (tuple->src.u.all | (tuple->dst.u.all << 16)),
+	                     ip_conntrack_hash_rnd) % ip_conntrack_htable_size);
+}
+
+int
+ip_ct_get_tuple(const struct iphdr *iph,
+		const struct sk_buff *skb,
+		unsigned int dataoff,
+		struct ip_conntrack_tuple *tuple,
+		const struct ip_conntrack_protocol *protocol)
+{
+	/* Never happen */
+	if (iph->frag_off & htons(IP_OFFSET)) {
+		printk("ip_conntrack_core: Frag of proto %u.\n",
+		       iph->protocol);
+		return 0;
+	}
+
+	tuple->src.ip = iph->saddr;
+	tuple->dst.ip = iph->daddr;
+	tuple->dst.protonum = iph->protocol;
+	tuple->dst.dir = IP_CT_DIR_ORIGINAL;
+
+	return protocol->pkt_to_tuple(skb, dataoff, tuple);
+}
+
+int
+ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
+		   const struct ip_conntrack_tuple *orig,
+		   const struct ip_conntrack_protocol *protocol)
+{
+	inverse->src.ip = orig->dst.ip;
+	inverse->dst.ip = orig->src.ip;
+	inverse->dst.protonum = orig->dst.protonum;
+	inverse->dst.dir = !orig->dst.dir;
+
+	return protocol->invert_tuple(inverse, orig);
+}
+
+
+/* ip_conntrack_expect helper functions */
+static void destroy_expect(struct ip_conntrack_expect *exp)
+{
+	ip_conntrack_put(exp->master);
+	IP_NF_ASSERT(!timer_pending(&exp->timeout));
+	kmem_cache_free(ip_conntrack_expect_cachep, exp);
+	CONNTRACK_STAT_INC(expect_delete);
+}
+
+static void unlink_expect(struct ip_conntrack_expect *exp)
+{
+	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+	list_del(&exp->list);
+	/* Logically in destroy_expect, but we hold the lock here. */
+	exp->master->expecting--;
+}
+
+static void expectation_timed_out(unsigned long ul_expect)
+{
+	struct ip_conntrack_expect *exp = (void *)ul_expect;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	unlink_expect(exp);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	destroy_expect(exp);
+}
+
+/* If an expectation for this connection is found, it gets delete from
+ * global list then returned. */
+static struct ip_conntrack_expect *
+find_expectation(const struct ip_conntrack_tuple *tuple)
+{
+	struct ip_conntrack_expect *i;
+
+	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		/* If master is not in hash table yet (ie. packet hasn't left
+		   this machine yet), how can other end know about expected?
+		   Hence these are not the droids you are looking for (if
+		   master ct never got confirmed, we'd hold a reference to it
+		   and weird things would happen to future packets). */
+		if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
+		    && is_confirmed(i->master)
+		    && del_timer(&i->timeout)) {
+			unlink_expect(i);
+			return i;
+		}
+	}
+	return NULL;
+}
+
+/* delete all expectations for this conntrack */
+static void remove_expectations(struct ip_conntrack *ct)
+{
+	struct ip_conntrack_expect *i, *tmp;
+
+	/* Optimization: most connection never expect any others. */
+	if (ct->expecting == 0)
+		return;
+
+	list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
+		if (i->master == ct && del_timer(&i->timeout)) {
+			unlink_expect(i);
+			destroy_expect(i);
+		}
+	}
+}
+
+static void
+clean_from_lists(struct ip_conntrack *ct)
+{
+	unsigned int ho, hr;
+	
+	DEBUGP("clean_from_lists(%p)\n", ct);
+	MUST_BE_WRITE_LOCKED(&ip_conntrack_lock);
+
+	ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+	LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
+
+	/* Destroy all pending expectations */
+	remove_expectations(ct);
+}
+
+static void
+destroy_conntrack(struct nf_conntrack *nfct)
+{
+	struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
+	struct ip_conntrack_protocol *proto;
+
+	DEBUGP("destroy_conntrack(%p)\n", ct);
+	IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
+	IP_NF_ASSERT(!timer_pending(&ct->timeout));
+
+	/* To make sure we don't get any weird locking issues here:
+	 * destroy_conntrack() MUST NOT be called with a write lock
+	 * to ip_conntrack_lock!!! -HW */
+	proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
+	if (proto && proto->destroy)
+		proto->destroy(ct);
+
+	if (ip_conntrack_destroyed)
+		ip_conntrack_destroyed(ct);
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* Expectations will have been removed in clean_from_lists,
+	 * except TFTP can create an expectation on the first packet,
+	 * before connection is in the list, so we need to clean here,
+	 * too. */
+	remove_expectations(ct);
+
+	/* We overload first tuple to link into unconfirmed list. */
+	if (!is_confirmed(ct)) {
+		BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
+		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+	}
+
+	CONNTRACK_STAT_INC(delete);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	if (ct->master)
+		ip_conntrack_put(ct->master);
+
+	DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
+	kmem_cache_free(ip_conntrack_cachep, ct);
+	atomic_dec(&ip_conntrack_count);
+}
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+	struct ip_conntrack *ct = (void *)ul_conntrack;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* Inside lock so preempt is disabled on module removal path.
+	 * Otherwise we can get spurious warnings. */
+	CONNTRACK_STAT_INC(delete_list);
+	clean_from_lists(ct);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	ip_conntrack_put(ct);
+}
+
+static inline int
+conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
+		    const struct ip_conntrack_tuple *tuple,
+		    const struct ip_conntrack *ignored_conntrack)
+{
+	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	return tuplehash_to_ctrack(i) != ignored_conntrack
+		&& ip_ct_tuple_equal(tuple, &i->tuple);
+}
+
+static struct ip_conntrack_tuple_hash *
+__ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
+		    const struct ip_conntrack *ignored_conntrack)
+{
+	struct ip_conntrack_tuple_hash *h;
+	unsigned int hash = hash_conntrack(tuple);
+
+	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
+		if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
+			CONNTRACK_STAT_INC(found);
+			return h;
+		}
+		CONNTRACK_STAT_INC(searched);
+	}
+
+	return NULL;
+}
+
+/* Find a connection corresponding to a tuple. */
+struct ip_conntrack_tuple_hash *
+ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
+		      const struct ip_conntrack *ignored_conntrack)
+{
+	struct ip_conntrack_tuple_hash *h;
+
+	READ_LOCK(&ip_conntrack_lock);
+	h = __ip_conntrack_find(tuple, ignored_conntrack);
+	if (h)
+		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+	READ_UNLOCK(&ip_conntrack_lock);
+
+	return h;
+}
+
+/* Confirm a connection given skb; places it in hash table */
+int
+__ip_conntrack_confirm(struct sk_buff **pskb)
+{
+	unsigned int hash, repl_hash;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	/* ipt_REJECT uses ip_conntrack_attach to attach related
+	   ICMP/TCP RST packets in other direction.  Actual packet
+	   which created connection will be IP_CT_NEW or for an
+	   expected connection, IP_CT_RELATED. */
+	if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+	repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	/* We're not in hash table, and we refuse to set up related
+	   connections for unconfirmed conns.  But packet copies and
+	   REJECT will give spurious warnings here. */
+	/* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
+
+	/* No external references means noone else could have
+           confirmed us. */
+	IP_NF_ASSERT(!is_confirmed(ct));
+	DEBUGP("Confirming conntrack %p\n", ct);
+
+	WRITE_LOCK(&ip_conntrack_lock);
+
+	/* See if there's one in the list already, including reverse:
+           NAT could have grabbed it without realizing, since we're
+           not in the hash.  If there is, we lost race. */
+	if (!LIST_FIND(&ip_conntrack_hash[hash],
+		       conntrack_tuple_cmp,
+		       struct ip_conntrack_tuple_hash *,
+		       &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
+	    && !LIST_FIND(&ip_conntrack_hash[repl_hash],
+			  conntrack_tuple_cmp,
+			  struct ip_conntrack_tuple_hash *,
+			  &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
+		/* Remove from unconfirmed list */
+		list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
+
+		list_prepend(&ip_conntrack_hash[hash],
+			     &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
+		list_prepend(&ip_conntrack_hash[repl_hash],
+			     &ct->tuplehash[IP_CT_DIR_REPLY]);
+		/* Timer relative to confirmation time, not original
+		   setting time, otherwise we'd get timer wrap in
+		   weird delay cases. */
+		ct->timeout.expires += jiffies;
+		add_timer(&ct->timeout);
+		atomic_inc(&ct->ct_general.use);
+		set_bit(IPS_CONFIRMED_BIT, &ct->status);
+		CONNTRACK_STAT_INC(insert);
+		WRITE_UNLOCK(&ip_conntrack_lock);
+		return NF_ACCEPT;
+	}
+
+	CONNTRACK_STAT_INC(insert_failed);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	return NF_DROP;
+}
+
+/* Returns true if a connection correspondings to the tuple (required
+   for NAT). */
+int
+ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
+			 const struct ip_conntrack *ignored_conntrack)
+{
+	struct ip_conntrack_tuple_hash *h;
+
+	READ_LOCK(&ip_conntrack_lock);
+	h = __ip_conntrack_find(tuple, ignored_conntrack);
+	READ_UNLOCK(&ip_conntrack_lock);
+
+	return h != NULL;
+}
+
+/* There's a small race here where we may free a just-assured
+   connection.  Too bad: we're in trouble anyway. */
+static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
+{
+	return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
+}
+
+static int early_drop(struct list_head *chain)
+{
+	/* Traverse backwards: gives us oldest, which is roughly LRU */
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct = NULL;
+	int dropped = 0;
+
+	READ_LOCK(&ip_conntrack_lock);
+	h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
+	if (h) {
+		ct = tuplehash_to_ctrack(h);
+		atomic_inc(&ct->ct_general.use);
+	}
+	READ_UNLOCK(&ip_conntrack_lock);
+
+	if (!ct)
+		return dropped;
+
+	if (del_timer(&ct->timeout)) {
+		death_by_timeout((unsigned long)ct);
+		dropped = 1;
+		CONNTRACK_STAT_INC(early_drop);
+	}
+	ip_conntrack_put(ct);
+	return dropped;
+}
+
+static inline int helper_cmp(const struct ip_conntrack_helper *i,
+			     const struct ip_conntrack_tuple *rtuple)
+{
+	return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
+}
+
+static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple)
+{
+	return LIST_FIND(&helpers, helper_cmp,
+			 struct ip_conntrack_helper *,
+			 tuple);
+}
+
+/* Allocate a new conntrack: we return -ENOMEM if classification
+   failed due to stress.  Otherwise it really is unclassifiable. */
+static struct ip_conntrack_tuple_hash *
+init_conntrack(const struct ip_conntrack_tuple *tuple,
+	       struct ip_conntrack_protocol *protocol,
+	       struct sk_buff *skb)
+{
+	struct ip_conntrack *conntrack;
+	struct ip_conntrack_tuple repl_tuple;
+	size_t hash;
+	struct ip_conntrack_expect *exp;
+
+	if (!ip_conntrack_hash_rnd_initted) {
+		get_random_bytes(&ip_conntrack_hash_rnd, 4);
+		ip_conntrack_hash_rnd_initted = 1;
+	}
+
+	hash = hash_conntrack(tuple);
+
+	if (ip_conntrack_max
+	    && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
+		/* Try dropping from this hash chain. */
+		if (!early_drop(&ip_conntrack_hash[hash])) {
+			if (net_ratelimit())
+				printk(KERN_WARNING
+				       "ip_conntrack: table full, dropping"
+				       " packet.\n");
+			return ERR_PTR(-ENOMEM);
+		}
+	}
+
+	if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
+		DEBUGP("Can't invert tuple.\n");
+		return NULL;
+	}
+
+	conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
+	if (!conntrack) {
+		DEBUGP("Can't allocate conntrack.\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	memset(conntrack, 0, sizeof(*conntrack));
+	atomic_set(&conntrack->ct_general.use, 1);
+	conntrack->ct_general.destroy = destroy_conntrack;
+	conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple;
+	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple;
+	if (!protocol->new(conntrack, skb)) {
+		kmem_cache_free(ip_conntrack_cachep, conntrack);
+		return NULL;
+	}
+	/* Don't set timer yet: wait for confirmation */
+	init_timer(&conntrack->timeout);
+	conntrack->timeout.data = (unsigned long)conntrack;
+	conntrack->timeout.function = death_by_timeout;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	exp = find_expectation(tuple);
+
+	if (exp) {
+		DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
+			conntrack, exp);
+		/* Welcome, Mr. Bond.  We've been expecting you... */
+		__set_bit(IPS_EXPECTED_BIT, &conntrack->status);
+		conntrack->master = exp->master;
+#if CONFIG_IP_NF_CONNTRACK_MARK
+		conntrack->mark = exp->master->mark;
+#endif
+		nf_conntrack_get(&conntrack->master->ct_general);
+		CONNTRACK_STAT_INC(expect_new);
+	} else {
+		conntrack->helper = ip_ct_find_helper(&repl_tuple);
+
+		CONNTRACK_STAT_INC(new);
+	}
+
+	/* Overload tuple linked list to put us in unconfirmed list. */
+	list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
+
+	atomic_inc(&ip_conntrack_count);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	if (exp) {
+		if (exp->expectfn)
+			exp->expectfn(conntrack, exp);
+		destroy_expect(exp);
+	}
+
+	return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
+}
+
+/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
+static inline struct ip_conntrack *
+resolve_normal_ct(struct sk_buff *skb,
+		  struct ip_conntrack_protocol *proto,
+		  int *set_reply,
+		  unsigned int hooknum,
+		  enum ip_conntrack_info *ctinfo)
+{
+	struct ip_conntrack_tuple tuple;
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack *ct;
+
+	IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
+
+	if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, 
+				&tuple,proto))
+		return NULL;
+
+	/* look for tuple match */
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (!h) {
+		h = init_conntrack(&tuple, proto, skb);
+		if (!h)
+			return NULL;
+		if (IS_ERR(h))
+			return (void *)h;
+	}
+	ct = tuplehash_to_ctrack(h);
+
+	/* It exists; we have (non-exclusive) reference. */
+	if (DIRECTION(h) == IP_CT_DIR_REPLY) {
+		*ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
+		/* Please set reply bit if this packet OK */
+		*set_reply = 1;
+	} else {
+		/* Once we've had two way comms, always ESTABLISHED. */
+		if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
+			DEBUGP("ip_conntrack_in: normal packet for %p\n",
+			       ct);
+		        *ctinfo = IP_CT_ESTABLISHED;
+		} else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
+			DEBUGP("ip_conntrack_in: related packet for %p\n",
+			       ct);
+			*ctinfo = IP_CT_RELATED;
+		} else {
+			DEBUGP("ip_conntrack_in: new packet for %p\n",
+			       ct);
+			*ctinfo = IP_CT_NEW;
+		}
+		*set_reply = 0;
+	}
+	skb->nfct = &ct->ct_general;
+	skb->nfctinfo = *ctinfo;
+	return ct;
+}
+
+/* Netfilter hook itself. */
+unsigned int ip_conntrack_in(unsigned int hooknum,
+			     struct sk_buff **pskb,
+			     const struct net_device *in,
+			     const struct net_device *out,
+			     int (*okfn)(struct sk_buff *))
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack_protocol *proto;
+	int set_reply;
+	int ret;
+
+	/* Previously seen (loopback or untracked)?  Ignore. */
+	if ((*pskb)->nfct) {
+		CONNTRACK_STAT_INC(ignore);
+		return NF_ACCEPT;
+	}
+
+	/* Never happen */
+	if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
+		if (net_ratelimit()) {
+		printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
+		       (*pskb)->nh.iph->protocol, hooknum);
+		}
+		return NF_DROP;
+	}
+
+	/* FIXME: Do this right please. --RR */
+	(*pskb)->nfcache |= NFC_UNKNOWN;
+
+/* Doesn't cover locally-generated broadcast, so not worth it. */
+#if 0
+	/* Ignore broadcast: no `connection'. */
+	if ((*pskb)->pkt_type == PACKET_BROADCAST) {
+		printk("Broadcast packet!\n");
+		return NF_ACCEPT;
+	} else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
+		   == htonl(0x000000FF)) {
+		printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
+		       NIPQUAD((*pskb)->nh.iph->saddr),
+		       NIPQUAD((*pskb)->nh.iph->daddr),
+		       (*pskb)->sk, (*pskb)->pkt_type);
+	}
+#endif
+
+	proto = ip_ct_find_proto((*pskb)->nh.iph->protocol);
+
+	/* It may be an special packet, error, unclean...
+	 * inverse of the return code tells to the netfilter
+	 * core what to do with the packet. */
+	if (proto->error != NULL 
+	    && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
+		CONNTRACK_STAT_INC(error);
+		CONNTRACK_STAT_INC(invalid);
+		return -ret;
+	}
+
+	if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) {
+		/* Not valid part of a connection */
+		CONNTRACK_STAT_INC(invalid);
+		return NF_ACCEPT;
+	}
+
+	if (IS_ERR(ct)) {
+		/* Too stressed to deal. */
+		CONNTRACK_STAT_INC(drop);
+		return NF_DROP;
+	}
+
+	IP_NF_ASSERT((*pskb)->nfct);
+
+	ret = proto->packet(ct, *pskb, ctinfo);
+	if (ret < 0) {
+		/* Invalid: inverse of the return code tells
+		 * the netfilter core what to do*/
+		nf_conntrack_put((*pskb)->nfct);
+		(*pskb)->nfct = NULL;
+		CONNTRACK_STAT_INC(invalid);
+		return -ret;
+	}
+
+	if (set_reply)
+		set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+
+	return ret;
+}
+
+int invert_tuplepr(struct ip_conntrack_tuple *inverse,
+		   const struct ip_conntrack_tuple *orig)
+{
+	return ip_ct_invert_tuple(inverse, orig, 
+				  ip_ct_find_proto(orig->dst.protonum));
+}
+
+/* Would two expected things clash? */
+static inline int expect_clash(const struct ip_conntrack_expect *a,
+			       const struct ip_conntrack_expect *b)
+{
+	/* Part covered by intersection of masks must be unequal,
+           otherwise they clash */
+	struct ip_conntrack_tuple intersect_mask
+		= { { a->mask.src.ip & b->mask.src.ip,
+		      { a->mask.src.u.all & b->mask.src.u.all } },
+		    { a->mask.dst.ip & b->mask.dst.ip,
+		      { a->mask.dst.u.all & b->mask.dst.u.all },
+		      a->mask.dst.protonum & b->mask.dst.protonum } };
+
+	return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
+}
+
+static inline int expect_matches(const struct ip_conntrack_expect *a,
+				 const struct ip_conntrack_expect *b)
+{
+	return a->master == b->master
+		&& ip_ct_tuple_equal(&a->tuple, &b->tuple)
+		&& ip_ct_tuple_equal(&a->mask, &b->mask);
+}
+
+/* Generally a bad idea to call this: could have matched already. */
+void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
+{
+	struct ip_conntrack_expect *i;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* choose the the oldest expectation to evict */
+	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+		if (expect_matches(i, exp) && del_timer(&i->timeout)) {
+			unlink_expect(i);
+			WRITE_UNLOCK(&ip_conntrack_lock);
+			destroy_expect(i);
+			return;
+		}
+	}
+	WRITE_UNLOCK(&ip_conntrack_lock);
+}
+
+struct ip_conntrack_expect *ip_conntrack_expect_alloc(void)
+{
+	struct ip_conntrack_expect *new;
+
+	new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
+	if (!new) {
+		DEBUGP("expect_related: OOM allocating expect\n");
+		return NULL;
+	}
+	new->master = NULL;
+	return new;
+}
+
+void ip_conntrack_expect_free(struct ip_conntrack_expect *expect)
+{
+	kmem_cache_free(ip_conntrack_expect_cachep, expect);
+}
+
+static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
+{
+	atomic_inc(&exp->master->ct_general.use);
+	exp->master->expecting++;
+	list_add(&exp->list, &ip_conntrack_expect_list);
+
+	if (exp->master->helper->timeout) {
+		init_timer(&exp->timeout);
+		exp->timeout.data = (unsigned long)exp;
+		exp->timeout.function = expectation_timed_out;
+		exp->timeout.expires
+			= jiffies + exp->master->helper->timeout * HZ;
+		add_timer(&exp->timeout);
+	} else
+		exp->timeout.function = NULL;
+
+	CONNTRACK_STAT_INC(expect_create);
+}
+
+/* Race with expectations being used means we could have none to find; OK. */
+static void evict_oldest_expect(struct ip_conntrack *master)
+{
+	struct ip_conntrack_expect *i;
+
+	list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
+		if (i->master == master) {
+			if (del_timer(&i->timeout)) {
+				unlink_expect(i);
+				destroy_expect(i);
+			}
+			break;
+		}
+	}
+}
+
+static inline int refresh_timer(struct ip_conntrack_expect *i)
+{
+	if (!del_timer(&i->timeout))
+		return 0;
+
+	i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
+	add_timer(&i->timeout);
+	return 1;
+}
+
+int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
+{
+	struct ip_conntrack_expect *i;
+	int ret;
+
+	DEBUGP("ip_conntrack_expect_related %p\n", related_to);
+	DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
+	DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	list_for_each_entry(i, &ip_conntrack_expect_list, list) {
+		if (expect_matches(i, expect)) {
+			/* Refresh timer: if it's dying, ignore.. */
+			if (refresh_timer(i)) {
+				ret = 0;
+				/* We don't need the one they've given us. */
+				ip_conntrack_expect_free(expect);
+				goto out;
+			}
+		} else if (expect_clash(i, expect)) {
+			ret = -EBUSY;
+			goto out;
+		}
+	}
+
+	/* Will be over limit? */
+	if (expect->master->helper->max_expected && 
+	    expect->master->expecting >= expect->master->helper->max_expected)
+		evict_oldest_expect(expect->master);
+
+	ip_conntrack_expect_insert(expect);
+	ret = 0;
+out:
+	WRITE_UNLOCK(&ip_conntrack_lock);
+ 	return ret;
+}
+
+/* Alter reply tuple (maybe alter helper).  This is for NAT, and is
+   implicitly racy: see __ip_conntrack_confirm */
+void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
+			      const struct ip_conntrack_tuple *newreply)
+{
+	WRITE_LOCK(&ip_conntrack_lock);
+	/* Should be unconfirmed, so not in hash table yet */
+	IP_NF_ASSERT(!is_confirmed(conntrack));
+
+	DEBUGP("Altering reply tuple of %p to ", conntrack);
+	DUMP_TUPLE(newreply);
+
+	conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
+	if (!conntrack->master && conntrack->expecting == 0)
+		conntrack->helper = ip_ct_find_helper(newreply);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+}
+
+int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
+{
+	BUG_ON(me->timeout == 0);
+	WRITE_LOCK(&ip_conntrack_lock);
+	list_prepend(&helpers, me);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	return 0;
+}
+
+static inline int unhelp(struct ip_conntrack_tuple_hash *i,
+			 const struct ip_conntrack_helper *me)
+{
+	if (tuplehash_to_ctrack(i)->helper == me)
+		tuplehash_to_ctrack(i)->helper = NULL;
+	return 0;
+}
+
+void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
+{
+	unsigned int i;
+	struct ip_conntrack_expect *exp, *tmp;
+
+	/* Need write lock here, to delete helper. */
+	WRITE_LOCK(&ip_conntrack_lock);
+	LIST_DELETE(&helpers, me);
+
+	/* Get rid of expectations */
+	list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
+		if (exp->master->helper == me && del_timer(&exp->timeout)) {
+			unlink_expect(exp);
+			destroy_expect(exp);
+		}
+	}
+	/* Get rid of expecteds, set helpers to NULL. */
+	LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
+	for (i = 0; i < ip_conntrack_htable_size; i++)
+		LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
+			    struct ip_conntrack_tuple_hash *, me);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	/* Someone could be still looking at the helper in a bh. */
+	synchronize_net();
+}
+
+static inline void ct_add_counters(struct ip_conntrack *ct,
+				   enum ip_conntrack_info ctinfo,
+				   const struct sk_buff *skb)
+{
+#ifdef CONFIG_IP_NF_CT_ACCT
+	if (skb) {
+		ct->counters[CTINFO2DIR(ctinfo)].packets++;
+		ct->counters[CTINFO2DIR(ctinfo)].bytes += 
+					ntohs(skb->nh.iph->tot_len);
+	}
+#endif
+}
+
+/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */
+void ip_ct_refresh_acct(struct ip_conntrack *ct, 
+		        enum ip_conntrack_info ctinfo,
+			const struct sk_buff *skb,
+			unsigned long extra_jiffies)
+{
+	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
+
+	/* If not in hash table, timer will not be active yet */
+	if (!is_confirmed(ct)) {
+		ct->timeout.expires = extra_jiffies;
+		ct_add_counters(ct, ctinfo, skb);
+	} else {
+		WRITE_LOCK(&ip_conntrack_lock);
+		/* Need del_timer for race avoidance (may already be dying). */
+		if (del_timer(&ct->timeout)) {
+			ct->timeout.expires = jiffies + extra_jiffies;
+			add_timer(&ct->timeout);
+		}
+		ct_add_counters(ct, ctinfo, skb);
+		WRITE_UNLOCK(&ip_conntrack_lock);
+	}
+}
+
+/* Returns new sk_buff, or NULL */
+struct sk_buff *
+ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
+{
+	struct sock *sk = skb->sk;
+#ifdef CONFIG_NETFILTER_DEBUG
+	unsigned int olddebug = skb->nf_debug;
+#endif
+
+	if (sk) {
+		sock_hold(sk);
+		skb_orphan(skb);
+	}
+
+	local_bh_disable(); 
+	skb = ip_defrag(skb, user);
+	local_bh_enable();
+
+	if (!skb) {
+		if (sk)
+			sock_put(sk);
+		return skb;
+	}
+
+	if (sk) {
+		skb_set_owner_w(skb, sk);
+		sock_put(sk);
+	}
+
+	ip_send_check(skb->nh.iph);
+	skb->nfcache |= NFC_ALTERED;
+#ifdef CONFIG_NETFILTER_DEBUG
+	/* Packet path as if nothing had happened. */
+	skb->nf_debug = olddebug;
+#endif
+	return skb;
+}
+
+/* Used by ipt_REJECT. */
+static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+
+	/* This ICMP is in reverse direction to the packet which caused it */
+	ct = ip_conntrack_get(skb, &ctinfo);
+	
+	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
+		ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
+	else
+		ctinfo = IP_CT_RELATED;
+
+	/* Attach to new skbuff, and increment count */
+	nskb->nfct = &ct->ct_general;
+	nskb->nfctinfo = ctinfo;
+	nf_conntrack_get(nskb->nfct);
+}
+
+static inline int
+do_iter(const struct ip_conntrack_tuple_hash *i,
+	int (*iter)(struct ip_conntrack *i, void *data),
+	void *data)
+{
+	return iter(tuplehash_to_ctrack(i), data);
+}
+
+/* Bring out ya dead! */
+static struct ip_conntrack_tuple_hash *
+get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
+		void *data, unsigned int *bucket)
+{
+	struct ip_conntrack_tuple_hash *h = NULL;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
+		h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
+				struct ip_conntrack_tuple_hash *, iter, data);
+		if (h)
+			break;
+	}
+	if (!h)
+		h = LIST_FIND_W(&unconfirmed, do_iter,
+				struct ip_conntrack_tuple_hash *, iter, data);
+	if (h)
+		atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	return h;
+}
+
+void
+ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
+{
+	struct ip_conntrack_tuple_hash *h;
+	unsigned int bucket = 0;
+
+	while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
+		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
+		/* Time to push up daises... */
+		if (del_timer(&ct->timeout))
+			death_by_timeout((unsigned long)ct);
+		/* ... else the timer will get him soon. */
+
+		ip_conntrack_put(ct);
+	}
+}
+
+/* Fast function for those who don't want to parse /proc (and I don't
+   blame them). */
+/* Reversing the socket's dst/src point of view gives us the reply
+   mapping. */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ip_conntrack_tuple_hash *h;
+	struct ip_conntrack_tuple tuple;
+	
+	IP_CT_TUPLE_U_BLANK(&tuple);
+	tuple.src.ip = inet->rcv_saddr;
+	tuple.src.u.tcp.port = inet->sport;
+	tuple.dst.ip = inet->daddr;
+	tuple.dst.u.tcp.port = inet->dport;
+	tuple.dst.protonum = IPPROTO_TCP;
+
+	/* We only do TCP at the moment: is there a better way? */
+	if (strcmp(sk->sk_prot->name, "TCP")) {
+		DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
+		return -ENOPROTOOPT;
+	}
+
+	if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
+		DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
+		       *len, sizeof(struct sockaddr_in));
+		return -EINVAL;
+	}
+
+	h = ip_conntrack_find_get(&tuple, NULL);
+	if (h) {
+		struct sockaddr_in sin;
+		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
+
+		sin.sin_family = AF_INET;
+		sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+			.tuple.dst.u.tcp.port;
+		sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+			.tuple.dst.ip;
+
+		DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
+		       NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
+		ip_conntrack_put(ct);
+		if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+			return -EFAULT;
+		else
+			return 0;
+	}
+	DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
+	       NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
+	       NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
+	return -ENOENT;
+}
+
+static struct nf_sockopt_ops so_getorigdst = {
+	.pf		= PF_INET,
+	.get_optmin	= SO_ORIGINAL_DST,
+	.get_optmax	= SO_ORIGINAL_DST+1,
+	.get		= &getorigdst,
+};
+
+static int kill_all(struct ip_conntrack *i, void *data)
+{
+	return 1;
+}
+
+static void free_conntrack_hash(void)
+{
+	if (ip_conntrack_vmalloc)
+		vfree(ip_conntrack_hash);
+	else
+		free_pages((unsigned long)ip_conntrack_hash, 
+			   get_order(sizeof(struct list_head)
+				     * ip_conntrack_htable_size));
+}
+
+/* Mishearing the voices in his head, our hero wonders how he's
+   supposed to kill the mall. */
+void ip_conntrack_cleanup(void)
+{
+	ip_ct_attach = NULL;
+	/* This makes sure all current packets have passed through
+           netfilter framework.  Roll on, two-stage module
+           delete... */
+	synchronize_net();
+ 
+ i_see_dead_people:
+	ip_ct_iterate_cleanup(kill_all, NULL);
+	if (atomic_read(&ip_conntrack_count) != 0) {
+		schedule();
+		goto i_see_dead_people;
+	}
+
+	kmem_cache_destroy(ip_conntrack_cachep);
+	kmem_cache_destroy(ip_conntrack_expect_cachep);
+	free_conntrack_hash();
+	nf_unregister_sockopt(&so_getorigdst);
+}
+
+static int hashsize;
+module_param(hashsize, int, 0400);
+
+int __init ip_conntrack_init(void)
+{
+	unsigned int i;
+	int ret;
+
+	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
+	 * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
+ 	if (hashsize) {
+ 		ip_conntrack_htable_size = hashsize;
+ 	} else {
+		ip_conntrack_htable_size
+			= (((num_physpages << PAGE_SHIFT) / 16384)
+			   / sizeof(struct list_head));
+		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+			ip_conntrack_htable_size = 8192;
+		if (ip_conntrack_htable_size < 16)
+			ip_conntrack_htable_size = 16;
+	}
+	ip_conntrack_max = 8 * ip_conntrack_htable_size;
+
+	printk("ip_conntrack version %s (%u buckets, %d max)"
+	       " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
+	       ip_conntrack_htable_size, ip_conntrack_max,
+	       sizeof(struct ip_conntrack));
+
+	ret = nf_register_sockopt(&so_getorigdst);
+	if (ret != 0) {
+		printk(KERN_ERR "Unable to register netfilter socket option\n");
+		return ret;
+	}
+
+	/* AK: the hash table is twice as big than needed because it
+	   uses list_head.  it would be much nicer to caches to use a
+	   single pointer list head here. */
+	ip_conntrack_vmalloc = 0; 
+	ip_conntrack_hash 
+		=(void*)__get_free_pages(GFP_KERNEL, 
+					 get_order(sizeof(struct list_head)
+						   *ip_conntrack_htable_size));
+	if (!ip_conntrack_hash) { 
+		ip_conntrack_vmalloc = 1;
+		printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
+		ip_conntrack_hash = vmalloc(sizeof(struct list_head)
+					    * ip_conntrack_htable_size);
+	}
+	if (!ip_conntrack_hash) {
+		printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
+		goto err_unreg_sockopt;
+	}
+
+	ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
+	                                        sizeof(struct ip_conntrack), 0,
+	                                        0, NULL, NULL);
+	if (!ip_conntrack_cachep) {
+		printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
+		goto err_free_hash;
+	}
+
+	ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
+					sizeof(struct ip_conntrack_expect),
+					0, 0, NULL, NULL);
+	if (!ip_conntrack_expect_cachep) {
+		printk(KERN_ERR "Unable to create ip_expect slab cache\n");
+		goto err_free_conntrack_slab;
+	}
+
+	/* Don't NEED lock here, but good form anyway. */
+	WRITE_LOCK(&ip_conntrack_lock);
+	for (i = 0; i < MAX_IP_CT_PROTO; i++)
+		ip_ct_protos[i] = &ip_conntrack_generic_protocol;
+	/* Sew in builtin protocols. */
+	ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
+	ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
+	ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
+	WRITE_UNLOCK(&ip_conntrack_lock);
+
+	for (i = 0; i < ip_conntrack_htable_size; i++)
+		INIT_LIST_HEAD(&ip_conntrack_hash[i]);
+
+	/* For use by ipt_REJECT */
+	ip_ct_attach = ip_conntrack_attach;
+
+	/* Set up fake conntrack:
+	    - to never be deleted, not in any hashes */
+	atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
+	/*  - and look it like as a confirmed connection */
+	set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
+
+	return ret;
+
+err_free_conntrack_slab:
+	kmem_cache_destroy(ip_conntrack_cachep);
+err_free_hash:
+	free_conntrack_hash();
+err_unreg_sockopt:
+	nf_unregister_sockopt(&so_getorigdst);
+
+	return -ENOMEM;
+}
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
new file mode 100644
index 000000000000..12b88cbb11db
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -0,0 +1,501 @@
+/* FTP extension for IP connection tracking. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell  
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/ctype.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+#include <linux/moduleparam.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp connection tracking helper");
+
+/* This is slow, but it's simple. --RR */
+static char ftp_buffer[65536];
+
+static DECLARE_LOCK(ip_ftp_lock);
+
+#define MAX_PORTS 8
+static int ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, int, &ports_c, 0400);
+
+static int loose;
+module_param(loose, int, 0600);
+
+unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb,
+				enum ip_conntrack_info ctinfo,
+				enum ip_ct_ftp_type type,
+				unsigned int matchoff,
+				unsigned int matchlen,
+				struct ip_conntrack_expect *exp,
+				u32 *seq);
+EXPORT_SYMBOL_GPL(ip_nat_ftp_hook);
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int try_rfc959(const char *, size_t, u_int32_t [], char);
+static int try_eprt(const char *, size_t, u_int32_t [], char);
+static int try_epsv_response(const char *, size_t, u_int32_t [], char);
+
+static struct ftp_search {
+	enum ip_conntrack_dir dir;
+	const char *pattern;
+	size_t plen;
+	char skip;
+	char term;
+	enum ip_ct_ftp_type ftptype;
+	int (*getnum)(const char *, size_t, u_int32_t[], char);
+} search[] = {
+	{
+		IP_CT_DIR_ORIGINAL,
+		"PORT",	sizeof("PORT") - 1, ' ', '\r',
+		IP_CT_FTP_PORT,
+		try_rfc959,
+	},
+	{
+		IP_CT_DIR_REPLY,
+		"227 ",	sizeof("227 ") - 1, '(', ')',
+		IP_CT_FTP_PASV,
+		try_rfc959,
+	},
+	{
+		IP_CT_DIR_ORIGINAL,
+		"EPRT", sizeof("EPRT") - 1, ' ', '\r',
+		IP_CT_FTP_EPRT,
+		try_eprt,
+	},
+	{
+		IP_CT_DIR_REPLY,
+		"229 ", sizeof("229 ") - 1, '(', ')',
+		IP_CT_FTP_EPSV,
+		try_epsv_response,
+	},
+};
+
+static int try_number(const char *data, size_t dlen, u_int32_t array[],
+		      int array_size, char sep, char term)
+{
+	u_int32_t i, len;
+
+	memset(array, 0, sizeof(array[0])*array_size);
+
+	/* Keep data pointing at next char. */
+	for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) {
+		if (*data >= '0' && *data <= '9') {
+			array[i] = array[i]*10 + *data - '0';
+		}
+		else if (*data == sep)
+			i++;
+		else {
+			/* Unexpected character; true if it's the
+			   terminator and we're finished. */
+			if (*data == term && i == array_size - 1)
+				return len;
+
+			DEBUGP("Char %u (got %u nums) `%u' unexpected\n",
+			       len, i, *data);
+			return 0;
+		}
+	}
+	DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep);
+
+	return 0;
+}
+
+/* Returns 0, or length of numbers: 192,168,1,1,5,6 */
+static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6],
+		       char term)
+{
+	return try_number(data, dlen, array, 6, ',', term);
+}
+
+/* Grab port: number up to delimiter */
+static int get_port(const char *data, int start, size_t dlen, char delim,
+		    u_int32_t array[2])
+{
+	u_int16_t port = 0;
+	int i;
+
+	for (i = start; i < dlen; i++) {
+		/* Finished? */
+		if (data[i] == delim) {
+			if (port == 0)
+				break;
+			array[0] = port >> 8;
+			array[1] = port;
+			return i + 1;
+		}
+		else if (data[i] >= '0' && data[i] <= '9')
+			port = port*10 + data[i] - '0';
+		else /* Some other crap */
+			break;
+	}
+	return 0;
+}
+
+/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */
+static int try_eprt(const char *data, size_t dlen, u_int32_t array[6],
+		    char term)
+{
+	char delim;
+	int length;
+
+	/* First character is delimiter, then "1" for IPv4, then
+           delimiter again. */
+	if (dlen <= 3) return 0;
+	delim = data[0];
+	if (isdigit(delim) || delim < 33 || delim > 126
+	    || data[1] != '1' || data[2] != delim)
+		return 0;
+
+	DEBUGP("EPRT: Got |1|!\n");
+	/* Now we have IP address. */
+	length = try_number(data + 3, dlen - 3, array, 4, '.', delim);
+	if (length == 0)
+		return 0;
+
+	DEBUGP("EPRT: Got IP address!\n");
+	/* Start offset includes initial "|1|", and trailing delimiter */
+	return get_port(data, 3 + length + 1, dlen, delim, array+4);
+}
+
+/* Returns 0, or length of numbers: |||6446| */
+static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6],
+			     char term)
+{
+	char delim;
+
+	/* Three delimiters. */
+	if (dlen <= 3) return 0;
+	delim = data[0];
+	if (isdigit(delim) || delim < 33 || delim > 126
+	    || data[1] != delim || data[2] != delim)
+		return 0;
+
+	return get_port(data, 3, dlen, delim, array+4);
+}
+
+/* Return 1 for match, 0 for accept, -1 for partial. */
+static int find_pattern(const char *data, size_t dlen,
+			const char *pattern, size_t plen,
+			char skip, char term,
+			unsigned int *numoff,
+			unsigned int *numlen,
+			u_int32_t array[6],
+			int (*getnum)(const char *, size_t, u_int32_t[], char))
+{
+	size_t i;
+
+	DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen);
+	if (dlen == 0)
+		return 0;
+
+	if (dlen <= plen) {
+		/* Short packet: try for partial? */
+		if (strnicmp(data, pattern, dlen) == 0)
+			return -1;
+		else return 0;
+	}
+
+	if (strnicmp(data, pattern, plen) != 0) {
+#if 0
+		size_t i;
+
+		DEBUGP("ftp: string mismatch\n");
+		for (i = 0; i < plen; i++) {
+			DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n",
+				i, data[i], data[i],
+				pattern[i], pattern[i]);
+		}
+#endif
+		return 0;
+	}
+
+	DEBUGP("Pattern matches!\n");
+	/* Now we've found the constant string, try to skip
+	   to the 'skip' character */
+	for (i = plen; data[i] != skip; i++)
+		if (i == dlen - 1) return -1;
+
+	/* Skip over the last character */
+	i++;
+
+	DEBUGP("Skipped up to `%c'!\n", skip);
+
+	*numoff = i;
+	*numlen = getnum(data + i, dlen - i, array, term);
+	if (!*numlen)
+		return -1;
+
+	DEBUGP("Match succeeded!\n");
+	return 1;
+}
+
+/* Look up to see if we're just after a \n. */
+static int find_nl_seq(u16 seq, const struct ip_ct_ftp_master *info, int dir)
+{
+	unsigned int i;
+
+	for (i = 0; i < info->seq_aft_nl_num[dir]; i++)
+		if (info->seq_aft_nl[dir][i] == seq)
+			return 1;
+	return 0;
+}
+
+/* We don't update if it's older than what we have. */
+static void update_nl_seq(u16 nl_seq, struct ip_ct_ftp_master *info, int dir)
+{
+	unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
+
+	/* Look for oldest: if we find exact match, we're done. */
+	for (i = 0; i < info->seq_aft_nl_num[dir]; i++) {
+		if (info->seq_aft_nl[dir][i] == nl_seq)
+			return;
+
+		if (oldest == info->seq_aft_nl_num[dir]
+		    || before(info->seq_aft_nl[dir][i], oldest))
+			oldest = i;
+	}
+
+	if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+		info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
+	else if (oldest != NUM_SEQ_TO_REMEMBER)
+		info->seq_aft_nl[dir][oldest] = nl_seq;
+}
+
+static int help(struct sk_buff **pskb,
+		struct ip_conntrack *ct,
+		enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff, datalen;
+	struct tcphdr _tcph, *th;
+	char *fb_ptr;
+	int ret;
+	u32 seq, array[6] = { 0 };
+	int dir = CTINFO2DIR(ctinfo);
+	unsigned int matchlen, matchoff;
+	struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info;
+	struct ip_conntrack_expect *exp;
+	unsigned int i;
+	int found = 0, ends_in_nl;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED
+	    && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) {
+		DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo);
+		return NF_ACCEPT;
+	}
+
+	th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return NF_ACCEPT;
+
+	dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
+	/* No data? */
+	if (dataoff >= (*pskb)->len) {
+		DEBUGP("ftp: pskblen = %u\n", (*pskb)->len);
+		return NF_ACCEPT;
+	}
+	datalen = (*pskb)->len - dataoff;
+
+	LOCK_BH(&ip_ftp_lock);
+	fb_ptr = skb_header_pointer(*pskb, dataoff,
+				    (*pskb)->len - dataoff, ftp_buffer);
+	BUG_ON(fb_ptr == NULL);
+
+	ends_in_nl = (fb_ptr[datalen - 1] == '\n');
+	seq = ntohl(th->seq) + datalen;
+
+	/* Look up to see if we're just after a \n. */
+	if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) {
+		/* Now if this ends in \n, update ftp info. */
+		DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n",
+		       ct_ftp_info->seq_aft_nl[0][dir] 
+		       old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl);
+		ret = NF_ACCEPT;
+		goto out_update_nl;
+	}
+
+	/* Initialize IP array to expected address (it's not mentioned
+           in EPSV responses) */
+	array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF;
+	array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF;
+	array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF;
+	array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF;
+
+	for (i = 0; i < ARRAY_SIZE(search); i++) {
+		if (search[i].dir != dir) continue;
+
+		found = find_pattern(fb_ptr, (*pskb)->len - dataoff,
+				     search[i].pattern,
+				     search[i].plen,
+				     search[i].skip,
+				     search[i].term,
+				     &matchoff, &matchlen,
+				     array,
+				     search[i].getnum);
+		if (found) break;
+	}
+	if (found == -1) {
+		/* We don't usually drop packets.  After all, this is
+		   connection tracking, not packet filtering.
+		   However, it is necessary for accurate tracking in
+		   this case. */
+		if (net_ratelimit())
+			printk("conntrack_ftp: partial %s %u+%u\n",
+			       search[i].pattern,
+			       ntohl(th->seq), datalen);
+		ret = NF_DROP;
+		goto out;
+	} else if (found == 0) { /* No match */
+		ret = NF_ACCEPT;
+		goto out_update_nl;
+	}
+
+	DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n",
+	       fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff);
+			 
+	/* Allocate expectation which will be inserted */
+	exp = ip_conntrack_expect_alloc();
+	if (exp == NULL) {
+		ret = NF_DROP;
+		goto out;
+	}
+
+	/* We refer to the reverse direction ("!dir") tuples here,
+	 * because we're expecting something in the other direction.
+	 * Doesn't matter unless NAT is happening.  */
+	exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip;
+
+	if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3])
+	    != ct->tuplehash[dir].tuple.src.ip) {
+		/* Enrico Scholz's passive FTP to partially RNAT'd ftp
+		   server: it really wants us to connect to a
+		   different IP address.  Simply don't record it for
+		   NAT. */
+		DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n",
+		       array[0], array[1], array[2], array[3],
+		       NIPQUAD(ct->tuplehash[dir].tuple.src.ip));
+
+		/* Thanks to Cristiano Lincoln Mattos
+		   <lincoln@cesar.org.br> for reporting this potential
+		   problem (DMZ machines opening holes to internal
+		   networks, or the packet filter itself). */
+		if (!loose) {
+			ret = NF_ACCEPT;
+			ip_conntrack_expect_free(exp);
+			goto out_update_nl;
+		}
+		exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16)
+					 | (array[2] << 8) | array[3]);
+	}
+
+	exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip;
+	exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]);
+	exp->tuple.src.u.tcp.port = 0; /* Don't care. */
+	exp->tuple.dst.protonum = IPPROTO_TCP;
+	exp->mask = ((struct ip_conntrack_tuple)
+		{ { 0xFFFFFFFF, { 0 } },
+		  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
+
+	exp->expectfn = NULL;
+	exp->master = ct;
+
+	/* Now, NAT might want to mangle the packet, and register the
+	 * (possibly changed) expectation itself. */
+	if (ip_nat_ftp_hook)
+		ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype,
+				      matchoff, matchlen, exp, &seq);
+	else {
+		/* Can't expect this?  Best to drop packet now. */
+		if (ip_conntrack_expect_related(exp) != 0) {
+			ip_conntrack_expect_free(exp);
+			ret = NF_DROP;
+		} else
+			ret = NF_ACCEPT;
+	}
+
+out_update_nl:
+	/* Now if this ends in \n, update ftp info.  Seq may have been
+	 * adjusted by NAT code. */
+	if (ends_in_nl)
+		update_nl_seq(seq, ct_ftp_info,dir);
+ out:
+	UNLOCK_BH(&ip_ftp_lock);
+	return ret;
+}
+
+static struct ip_conntrack_helper ftp[MAX_PORTS];
+static char ftp_names[MAX_PORTS][10];
+
+/* Not __exit: called from init() */
+static void fini(void)
+{
+	int i;
+	for (i = 0; i < ports_c; i++) {
+		DEBUGP("ip_ct_ftp: unregistering helper for port %d\n",
+				ports[i]);
+		ip_conntrack_helper_unregister(&ftp[i]);
+	}
+}
+
+static int __init init(void)
+{
+	int i, ret;
+	char *tmpname;
+
+	if (ports_c == 0)
+		ports[ports_c++] = FTP_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		ftp[i].tuple.src.u.tcp.port = htons(ports[i]);
+		ftp[i].tuple.dst.protonum = IPPROTO_TCP;
+		ftp[i].mask.src.u.tcp.port = 0xFFFF;
+		ftp[i].mask.dst.protonum = 0xFF;
+		ftp[i].max_expected = 1;
+		ftp[i].timeout = 5 * 60; /* 5 minutes */
+		ftp[i].me = THIS_MODULE;
+		ftp[i].help = help;
+
+		tmpname = &ftp_names[i][0];
+		if (ports[i] == FTP_PORT)
+			sprintf(tmpname, "ftp");
+		else
+			sprintf(tmpname, "ftp-%d", ports[i]);
+		ftp[i].name = tmpname;
+
+		DEBUGP("ip_ct_ftp: registering helper for port %d\n", 
+				ports[i]);
+		ret = ip_conntrack_helper_register(&ftp[i]);
+
+		if (ret) {
+			fini();
+			return ret;
+		}
+	}
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c
new file mode 100644
index 000000000000..33cc7348b6ee
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_irc.c
@@ -0,0 +1,313 @@
+/* IRC extension for IP connection tracking, Version 1.21
+ * (C) 2000-2002 by Harald Welte <laforge@gnumonks.org>
+ * based on RR's ip_conntrack_ftp.c	
+ *
+ * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ **
+ *	Module load syntax:
+ * 	insmod ip_conntrack_irc.o ports=port1,port2,...port<MAX_PORTS>
+ *			    max_dcc_channels=n dcc_timeout=secs
+ *	
+ * 	please give the ports of all IRC servers You wish to connect to.
+ *	If You don't specify ports, the default will be port 6667.
+ *	With max_dcc_channels you can define the maximum number of not
+ *	yet answered DCC channels per IRC session (default 8).
+ *	With dcc_timeout you can specify how long the system waits for 
+ *	an expected DCC channel (default 300 seconds).
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
+#include <linux/moduleparam.h>
+
+#define MAX_PORTS 8
+static int ports[MAX_PORTS];
+static int ports_c;
+static int max_dcc_channels = 8;
+static unsigned int dcc_timeout = 300;
+/* This is slow, but it's simple. --RR */
+static char irc_buffer[65536];
+static DECLARE_LOCK(irc_buffer_lock);
+
+unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb,
+				enum ip_conntrack_info ctinfo,
+				unsigned int matchoff,
+				unsigned int matchlen,
+				struct ip_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(ip_nat_irc_hook);
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IRC (DCC) connection tracking helper");
+MODULE_LICENSE("GPL");
+module_param_array(ports, int, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of IRC servers");
+module_param(max_dcc_channels, int, 0400);
+MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session");
+module_param(dcc_timeout, int, 0400);
+MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels");
+
+static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " };
+#define MINMATCHLEN	5
+
+#if 0
+#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \
+                                       __FILE__, __FUNCTION__ , ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int parse_dcc(char *data, char *data_end, u_int32_t *ip,
+		     u_int16_t *port, char **ad_beg_p, char **ad_end_p)
+/* tries to get the ip_addr and port out of a dcc command
+   return value: -1 on failure, 0 on success 
+	data		pointer to first byte of DCC command data
+	data_end	pointer to last byte of dcc command data
+	ip		returns parsed ip of dcc command
+	port		returns parsed port of dcc command
+	ad_beg_p	returns pointer to first byte of addr data
+	ad_end_p	returns pointer to last byte of addr data */
+{
+
+	/* at least 12: "AAAAAAAA P\1\n" */
+	while (*data++ != ' ')
+		if (data > data_end - 12)
+			return -1;
+
+	*ad_beg_p = data;
+	*ip = simple_strtoul(data, &data, 10);
+
+	/* skip blanks between ip and port */
+	while (*data == ' ') {
+		if (data >= data_end) 
+			return -1;
+		data++;
+	}
+
+	*port = simple_strtoul(data, &data, 10);
+	*ad_end_p = data;
+
+	return 0;
+}
+
+static int help(struct sk_buff **pskb,
+		struct ip_conntrack *ct, enum ip_conntrack_info ctinfo)
+{
+	unsigned int dataoff;
+	struct tcphdr _tcph, *th;
+	char *data, *data_limit, *ib_ptr;
+	int dir = CTINFO2DIR(ctinfo);
+	struct ip_conntrack_expect *exp;
+	u32 seq;
+	u_int32_t dcc_ip;
+	u_int16_t dcc_port;
+	int i, ret = NF_ACCEPT;
+	char *addr_beg_p, *addr_end_p;
+
+	DEBUGP("entered\n");
+
+	/* If packet is coming from IRC server */
+	if (dir == IP_CT_DIR_REPLY)
+		return NF_ACCEPT;
+
+	/* Until there's been traffic both ways, don't look in packets. */
+	if (ctinfo != IP_CT_ESTABLISHED
+	    && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) {
+		DEBUGP("Conntrackinfo = %u\n", ctinfo);
+		return NF_ACCEPT;
+	}
+
+	/* Not a full tcp header? */
+	th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		return NF_ACCEPT;
+
+	/* No data? */
+	dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4;
+	if (dataoff >= (*pskb)->len)
+		return NF_ACCEPT;
+
+	LOCK_BH(&irc_buffer_lock);
+	ib_ptr = skb_header_pointer(*pskb, dataoff,
+				    (*pskb)->len - dataoff, irc_buffer);
+	BUG_ON(ib_ptr == NULL);
+
+	data = ib_ptr;
+	data_limit = ib_ptr + (*pskb)->len - dataoff;
+
+	/* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24
+	 * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */
+	while (data < (data_limit - (19 + MINMATCHLEN))) {
+		if (memcmp(data, "\1DCC ", 5)) {
+			data++;
+			continue;
+		}
+
+		data += 5;
+		/* we have at least (19+MINMATCHLEN)-5 bytes valid data left */
+
+		DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n",
+			NIPQUAD(iph->saddr), ntohs(th->source),
+			NIPQUAD(iph->daddr), ntohs(th->dest));
+
+		for (i = 0; i < ARRAY_SIZE(dccprotos); i++) {
+			if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) {
+				/* no match */
+				continue;
+			}
+
+			DEBUGP("DCC %s detected\n", dccprotos[i]);
+			data += strlen(dccprotos[i]);
+			/* we have at least 
+			 * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid
+			 * data left (== 14/13 bytes) */
+			if (parse_dcc((char *)data, data_limit, &dcc_ip,
+				       &dcc_port, &addr_beg_p, &addr_end_p)) {
+				/* unable to parse */
+				DEBUGP("unable to parse dcc command\n");
+				continue;
+			}
+			DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n",
+				HIPQUAD(dcc_ip), dcc_port);
+
+			/* dcc_ip can be the internal OR external (NAT'ed) IP
+			 * Tiago Sousa <mirage@kaotik.org> */
+			if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip)
+			    && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) {
+				if (net_ratelimit())
+					printk(KERN_WARNING
+						"Forged DCC command from "
+						"%u.%u.%u.%u: %u.%u.%u.%u:%u\n",
+				NIPQUAD(ct->tuplehash[dir].tuple.src.ip),
+						HIPQUAD(dcc_ip), dcc_port);
+
+				continue;
+			}
+
+			exp = ip_conntrack_expect_alloc();
+			if (exp == NULL) {
+				ret = NF_DROP;
+				goto out;
+			}
+
+			/* save position of address in dcc string,
+			 * necessary for NAT */
+			DEBUGP("tcph->seq = %u\n", th->seq);
+			seq = ntohl(th->seq) + (addr_beg_p - ib_ptr);
+
+			/* We refer to the reverse direction ("!dir")
+			 * tuples here, because we're expecting
+			 * something in the other * direction.
+			 * Doesn't matter unless NAT is happening.  */
+			exp->tuple = ((struct ip_conntrack_tuple)
+				{ { 0, { 0 } },
+				  { ct->tuplehash[!dir].tuple.dst.ip,
+				    { .tcp = { htons(dcc_port) } },
+				    IPPROTO_TCP }});
+			exp->mask = ((struct ip_conntrack_tuple)
+				{ { 0, { 0 } },
+				  { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }});
+			exp->expectfn = NULL;
+			exp->master = ct;
+			if (ip_nat_irc_hook)
+				ret = ip_nat_irc_hook(pskb, ctinfo, 
+						      addr_beg_p - ib_ptr,
+						      addr_end_p - addr_beg_p,
+						      exp);
+			else if (ip_conntrack_expect_related(exp) != 0) {
+				ip_conntrack_expect_free(exp);
+				ret = NF_DROP;
+			}
+			goto out;
+		} /* for .. NUM_DCCPROTO */
+	} /* while data < ... */
+
+ out:
+	UNLOCK_BH(&irc_buffer_lock);
+	return ret;
+}
+
+static struct ip_conntrack_helper irc_helpers[MAX_PORTS];
+static char irc_names[MAX_PORTS][10];
+
+static void fini(void);
+
+static int __init init(void)
+{
+	int i, ret;
+	struct ip_conntrack_helper *hlpr;
+	char *tmpname;
+
+	if (max_dcc_channels < 1) {
+		printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n");
+		return -EBUSY;
+	}
+	if (dcc_timeout < 0) {
+		printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n");
+		return -EBUSY;
+	}
+	
+	/* If no port given, default to standard irc port */
+	if (ports_c == 0)
+		ports[ports_c++] = IRC_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		hlpr = &irc_helpers[i];
+		hlpr->tuple.src.u.tcp.port = htons(ports[i]);
+		hlpr->tuple.dst.protonum = IPPROTO_TCP;
+		hlpr->mask.src.u.tcp.port = 0xFFFF;
+		hlpr->mask.dst.protonum = 0xFF;
+		hlpr->max_expected = max_dcc_channels;
+		hlpr->timeout = dcc_timeout;
+		hlpr->me = THIS_MODULE;
+		hlpr->help = help;
+
+		tmpname = &irc_names[i][0];
+		if (ports[i] == IRC_PORT)
+			sprintf(tmpname, "irc");
+		else
+			sprintf(tmpname, "irc-%d", i);
+		hlpr->name = tmpname;
+
+		DEBUGP("port #%d: %d\n", i, ports[i]);
+
+		ret = ip_conntrack_helper_register(hlpr);
+
+		if (ret) {
+			printk("ip_conntrack_irc: ERROR registering port %d\n",
+				ports[i]);
+			fini();
+			return -EBUSY;
+		}
+	}
+	return 0;
+}
+
+/* This function is intentionally _NOT_ defined as __exit, because 
+ * it is needed by the init function */
+static void fini(void)
+{
+	int i;
+	for (i = 0; i < ports_c; i++) {
+		DEBUGP("unregistering port %d\n",
+		       ports[i]);
+		ip_conntrack_helper_unregister(&irc_helpers[i]);
+	}
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
new file mode 100644
index 000000000000..88c3712bd251
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c
@@ -0,0 +1,75 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+unsigned long ip_ct_generic_timeout = 600*HZ;
+
+static int generic_pkt_to_tuple(const struct sk_buff *skb,
+				unsigned int dataoff,
+				struct ip_conntrack_tuple *tuple)
+{
+	tuple->src.u.all = 0;
+	tuple->dst.u.all = 0;
+
+	return 1;
+}
+
+static int generic_invert_tuple(struct ip_conntrack_tuple *tuple,
+				const struct ip_conntrack_tuple *orig)
+{
+	tuple->src.u.all = 0;
+	tuple->dst.u.all = 0;
+
+	return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int generic_print_tuple(struct seq_file *s,
+			       const struct ip_conntrack_tuple *tuple)
+{
+	return 0;
+}
+
+/* Print out the private part of the conntrack. */
+static int generic_print_conntrack(struct seq_file *s,
+				   const struct ip_conntrack *state)
+{
+	return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int packet(struct ip_conntrack *conntrack,
+		  const struct sk_buff *skb,
+		  enum ip_conntrack_info ctinfo)
+{
+	ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout);
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
+{
+	return 1;
+}
+
+struct ip_conntrack_protocol ip_conntrack_generic_protocol =
+{
+	.proto			= 0,
+	.name			= "unknown",
+	.pkt_to_tuple		= generic_pkt_to_tuple,
+	.invert_tuple		= generic_invert_tuple,
+	.print_tuple		= generic_print_tuple,
+	.print_conntrack	= generic_print_conntrack,
+	.packet			= packet,
+	.new			= new,
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
new file mode 100644
index 000000000000..602c74db3252
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -0,0 +1,279 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/in.h>
+#include <linux/icmp.h>
+#include <linux/seq_file.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+unsigned long ip_ct_icmp_timeout = 30*HZ;
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int icmp_pkt_to_tuple(const struct sk_buff *skb,
+			     unsigned int dataoff,
+			     struct ip_conntrack_tuple *tuple)
+{
+	struct icmphdr _hdr, *hp;
+
+	hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL)
+		return 0;
+
+	tuple->dst.u.icmp.type = hp->type;
+	tuple->src.u.icmp.id = hp->un.echo.id;
+	tuple->dst.u.icmp.code = hp->code;
+
+	return 1;
+}
+
+static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple,
+			     const struct ip_conntrack_tuple *orig)
+{
+	/* Add 1; spaces filled with 0. */
+	static u_int8_t invmap[]
+		= { [ICMP_ECHO] = ICMP_ECHOREPLY + 1,
+		    [ICMP_ECHOREPLY] = ICMP_ECHO + 1,
+		    [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1,
+		    [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1,
+		    [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1,
+		    [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1,
+		    [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1,
+		    [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1};
+
+	if (orig->dst.u.icmp.type >= sizeof(invmap)
+	    || !invmap[orig->dst.u.icmp.type])
+		return 0;
+
+	tuple->src.u.icmp.id = orig->src.u.icmp.id;
+	tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1;
+	tuple->dst.u.icmp.code = orig->dst.u.icmp.code;
+	return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int icmp_print_tuple(struct seq_file *s,
+			    const struct ip_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "type=%u code=%u id=%u ",
+			  tuple->dst.u.icmp.type,
+			  tuple->dst.u.icmp.code,
+			  ntohs(tuple->src.u.icmp.id));
+}
+
+/* Print out the private part of the conntrack. */
+static int icmp_print_conntrack(struct seq_file *s,
+				const struct ip_conntrack *conntrack)
+{
+	return 0;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int icmp_packet(struct ip_conntrack *ct,
+		       const struct sk_buff *skb,
+		       enum ip_conntrack_info ctinfo)
+{
+	/* Try to delete connection immediately after all replies:
+           won't actually vanish as we still have skb, and del_timer
+           means this will only run once even if count hits zero twice
+           (theoretically possible with SMP) */
+	if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) {
+		if (atomic_dec_and_test(&ct->proto.icmp.count)
+		    && del_timer(&ct->timeout))
+			ct->timeout.function((unsigned long)ct);
+	} else {
+		atomic_inc(&ct->proto.icmp.count);
+		ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
+	}
+
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int icmp_new(struct ip_conntrack *conntrack,
+		    const struct sk_buff *skb)
+{
+	static u_int8_t valid_new[]
+		= { [ICMP_ECHO] = 1,
+		    [ICMP_TIMESTAMP] = 1,
+		    [ICMP_INFO_REQUEST] = 1,
+		    [ICMP_ADDRESS] = 1 };
+
+	if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
+	    || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) {
+		/* Can't create a new ICMP `conn' with this. */
+		DEBUGP("icmp: can't create new conn with type %u\n",
+		       conntrack->tuplehash[0].tuple.dst.u.icmp.type);
+		DUMP_TUPLE(&conntrack->tuplehash[0].tuple);
+		return 0;
+	}
+	atomic_set(&conntrack->proto.icmp.count, 0);
+	return 1;
+}
+
+static int
+icmp_error_message(struct sk_buff *skb,
+		   enum ip_conntrack_info *ctinfo,
+		   unsigned int hooknum)
+{
+	struct ip_conntrack_tuple innertuple, origtuple;
+	struct {
+		struct icmphdr icmp;
+		struct iphdr ip;
+	} _in, *inside;
+	struct ip_conntrack_protocol *innerproto;
+	struct ip_conntrack_tuple_hash *h;
+	int dataoff;
+
+	IP_NF_ASSERT(skb->nfct == NULL);
+
+	/* Not enough header? */
+	inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in);
+	if (inside == NULL)
+		return NF_ACCEPT;
+
+	/* Ignore ICMP's containing fragments (shouldn't happen) */
+	if (inside->ip.frag_off & htons(IP_OFFSET)) {
+		DEBUGP("icmp_error_track: fragment of proto %u\n",
+		       inside->ip.protocol);
+		return NF_ACCEPT;
+	}
+
+	innerproto = ip_ct_find_proto(inside->ip.protocol);
+	dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4;
+	/* Are they talking about one of our connections? */
+	if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) {
+		DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol);
+		return NF_ACCEPT;
+	}
+
+	/* Ordinarily, we'd expect the inverted tupleproto, but it's
+	   been preserved inside the ICMP. */
+	if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
+		DEBUGP("icmp_error_track: Can't invert tuple\n");
+		return NF_ACCEPT;
+	}
+
+	*ctinfo = IP_CT_RELATED;
+
+	h = ip_conntrack_find_get(&innertuple, NULL);
+	if (!h) {
+		/* Locally generated ICMPs will match inverted if they
+		   haven't been SNAT'ed yet */
+		/* FIXME: NAT code has to handle half-done double NAT --RR */
+		if (hooknum == NF_IP_LOCAL_OUT)
+			h = ip_conntrack_find_get(&origtuple, NULL);
+
+		if (!h) {
+			DEBUGP("icmp_error_track: no match\n");
+			return NF_ACCEPT;
+		}
+		/* Reverse direction from that found */
+		if (DIRECTION(h) != IP_CT_DIR_REPLY)
+			*ctinfo += IP_CT_IS_REPLY;
+	} else {
+		if (DIRECTION(h) == IP_CT_DIR_REPLY)
+			*ctinfo += IP_CT_IS_REPLY;
+	}
+
+	/* Update skb to refer to this connection */
+	skb->nfct = &tuplehash_to_ctrack(h)->ct_general;
+	skb->nfctinfo = *ctinfo;
+	return -NF_ACCEPT;
+}
+
+/* Small and modified version of icmp_rcv */
+static int
+icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
+	   unsigned int hooknum)
+{
+	struct icmphdr _ih, *icmph;
+
+	/* Not enough header? */
+	icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih);
+	if (icmph == NULL) {
+		if (LOG_INVALID(IPPROTO_ICMP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+				      "ip_ct_icmp: short packet ");
+		return -NF_ACCEPT;
+	}
+
+	/* See ip_conntrack_proto_tcp.c */
+	if (hooknum != NF_IP_PRE_ROUTING)
+		goto checksum_skipped;
+
+	switch (skb->ip_summed) {
+	case CHECKSUM_HW:
+		if (!(u16)csum_fold(skb->csum)) 
+			break;
+		if (LOG_INVALID(IPPROTO_ICMP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				      "ip_ct_icmp: bad HW ICMP checksum ");
+		return -NF_ACCEPT;
+	case CHECKSUM_NONE:
+		if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) {
+			if (LOG_INVALID(IPPROTO_ICMP))
+				nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+					      "ip_ct_icmp: bad ICMP checksum ");
+			return -NF_ACCEPT;
+		}
+	default:
+		break;
+	}
+
+checksum_skipped:
+	/*
+	 *	18 is the highest 'known' ICMP type. Anything else is a mystery
+	 *
+	 *	RFC 1122: 3.2.2  Unknown ICMP messages types MUST be silently
+	 *		  discarded.
+	 */
+	if (icmph->type > NR_ICMP_TYPES) {
+		if (LOG_INVALID(IPPROTO_ICMP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+				      "ip_ct_icmp: invalid ICMP type ");
+		return -NF_ACCEPT;
+	}
+
+	/* Need to track icmp error message? */
+	if (icmph->type != ICMP_DEST_UNREACH
+	    && icmph->type != ICMP_SOURCE_QUENCH
+	    && icmph->type != ICMP_TIME_EXCEEDED
+	    && icmph->type != ICMP_PARAMETERPROB
+	    && icmph->type != ICMP_REDIRECT)
+		return NF_ACCEPT;
+
+	return icmp_error_message(skb, ctinfo, hooknum);
+}
+
+struct ip_conntrack_protocol ip_conntrack_protocol_icmp =
+{
+	.proto 			= IPPROTO_ICMP,
+	.name 			= "icmp",
+	.pkt_to_tuple		= icmp_pkt_to_tuple,
+	.invert_tuple		= icmp_invert_tuple,
+	.print_tuple		= icmp_print_tuple,
+	.print_conntrack	= icmp_print_conntrack,
+	.packet			= icmp_packet,
+	.new			= icmp_new,
+	.error			= icmp_error,
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
new file mode 100644
index 000000000000..ff8c34a860ff
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -0,0 +1,649 @@
+/*
+ * Connection tracking protocol helper module for SCTP.
+ * 
+ * SCTP is defined in RFC 2960. References to various sections in this code 
+ * are to this RFC.
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Added support for proc manipulation of timeouts.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/sctp.h>
+#include <linux/string.h>
+#include <linux/seq_file.h>
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+#if 0
+#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Protects conntrack->proto.sctp */
+static DECLARE_RWLOCK(sctp_lock);
+
+/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+   closely.  They're more complex. --RR 
+
+   And so for me for SCTP :D -Kiran */
+
+static const char *sctp_conntrack_names[] = {
+	"NONE",
+	"CLOSED",
+	"COOKIE_WAIT",
+	"COOKIE_ECHOED",
+	"ESTABLISHED",
+	"SHUTDOWN_SENT",
+	"SHUTDOWN_RECD",
+	"SHUTDOWN_ACK_SENT",
+};
+
+#define SECS  * HZ
+#define MINS  * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS  * 24 HOURS
+
+static unsigned long ip_ct_sctp_timeout_closed            =  10 SECS;
+static unsigned long ip_ct_sctp_timeout_cookie_wait       =   3 SECS;
+static unsigned long ip_ct_sctp_timeout_cookie_echoed     =   3 SECS;
+static unsigned long ip_ct_sctp_timeout_established       =   5 DAYS;
+static unsigned long ip_ct_sctp_timeout_shutdown_sent     = 300 SECS / 1000;
+static unsigned long ip_ct_sctp_timeout_shutdown_recd     = 300 SECS / 1000;
+static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent =   3 SECS;
+
+static unsigned long * sctp_timeouts[]
+= { NULL,                                  /* SCTP_CONNTRACK_NONE  */
+    &ip_ct_sctp_timeout_closed,	           /* SCTP_CONNTRACK_CLOSED */
+    &ip_ct_sctp_timeout_cookie_wait,       /* SCTP_CONNTRACK_COOKIE_WAIT */
+    &ip_ct_sctp_timeout_cookie_echoed,     /* SCTP_CONNTRACK_COOKIE_ECHOED */
+    &ip_ct_sctp_timeout_established,       /* SCTP_CONNTRACK_ESTABLISHED */
+    &ip_ct_sctp_timeout_shutdown_sent,     /* SCTP_CONNTRACK_SHUTDOWN_SENT */
+    &ip_ct_sctp_timeout_shutdown_recd,     /* SCTP_CONNTRACK_SHUTDOWN_RECD */
+    &ip_ct_sctp_timeout_shutdown_ack_sent  /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */
+ };
+
+#define sNO SCTP_CONNTRACK_NONE
+#define	sCL SCTP_CONNTRACK_CLOSED
+#define	sCW SCTP_CONNTRACK_COOKIE_WAIT
+#define	sCE SCTP_CONNTRACK_COOKIE_ECHOED
+#define	sES SCTP_CONNTRACK_ESTABLISHED
+#define	sSS SCTP_CONNTRACK_SHUTDOWN_SENT
+#define	sSR SCTP_CONNTRACK_SHUTDOWN_RECD
+#define	sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT
+#define	sIV SCTP_CONNTRACK_MAX
+
+/* 
+	These are the descriptions of the states:
+
+NOTE: These state names are tantalizingly similar to the states of an 
+SCTP endpoint. But the interpretation of the states is a little different,
+considering that these are the states of the connection and not of an end 
+point. Please note the subtleties. -Kiran
+
+NONE              - Nothing so far.
+COOKIE WAIT       - We have seen an INIT chunk in the original direction, or also 
+                    an INIT_ACK chunk in the reply direction.
+COOKIE ECHOED     - We have seen a COOKIE_ECHO chunk in the original direction.
+ESTABLISHED       - We have seen a COOKIE_ACK in the reply direction.
+SHUTDOWN_SENT     - We have seen a SHUTDOWN chunk in the original direction.
+SHUTDOWN_RECD     - We have seen a SHUTDOWN chunk in the reply directoin.
+SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite
+                    to that of the SHUTDOWN chunk.
+CLOSED            - We have seen a SHUTDOWN_COMPLETE chunk in the direction of 
+                    the SHUTDOWN chunk. Connection is closed.
+*/
+
+/* TODO
+ - I have assumed that the first INIT is in the original direction. 
+ This messes things when an INIT comes in the reply direction in CLOSED
+ state.
+ - Check the error type in the reply dir before transitioning from 
+cookie echoed to closed.
+ - Sec 5.2.4 of RFC 2960
+ - Multi Homing support.
+*/
+
+/* SCTP conntrack state transitions */
+static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = {
+	{
+/*	ORIGINAL	*/
+/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init         */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA},
+/* init_ack     */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort        */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown     */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA},
+/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error        */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/
+/* cookie_echo  */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */
+/* cookie_ack   */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */
+/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+	},
+	{
+/*	REPLY	*/
+/*                  sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */
+/* init         */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */
+/* init_ack     */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},
+/* abort        */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL},
+/* shutdown     */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA},
+/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA},
+/* error        */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA},
+/* cookie_echo  */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */
+/* cookie_ack   */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA},
+/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL}
+	}
+};
+
+static int sctp_pkt_to_tuple(const struct sk_buff *skb,
+			     unsigned int dataoff,
+			     struct ip_conntrack_tuple *tuple)
+{
+	sctp_sctphdr_t _hdr, *hp;
+
+	DEBUGP(__FUNCTION__);
+	DEBUGP("\n");
+
+	/* Actually only need first 8 bytes. */
+	hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+	if (hp == NULL)
+		return 0;
+
+	tuple->src.u.sctp.port = hp->source;
+	tuple->dst.u.sctp.port = hp->dest;
+	return 1;
+}
+
+static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple,
+			     const struct ip_conntrack_tuple *orig)
+{
+	DEBUGP(__FUNCTION__);
+	DEBUGP("\n");
+
+	tuple->src.u.sctp.port = orig->dst.u.sctp.port;
+	tuple->dst.u.sctp.port = orig->src.u.sctp.port;
+	return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int sctp_print_tuple(struct seq_file *s,
+			    const struct ip_conntrack_tuple *tuple)
+{
+	DEBUGP(__FUNCTION__);
+	DEBUGP("\n");
+
+	return seq_printf(s, "sport=%hu dport=%hu ",
+			  ntohs(tuple->src.u.sctp.port),
+			  ntohs(tuple->dst.u.sctp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int sctp_print_conntrack(struct seq_file *s,
+				const struct ip_conntrack *conntrack)
+{
+	enum sctp_conntrack state;
+
+	DEBUGP(__FUNCTION__);
+	DEBUGP("\n");
+
+	READ_LOCK(&sctp_lock);
+	state = conntrack->proto.sctp.state;
+	READ_UNLOCK(&sctp_lock);
+
+	return seq_printf(s, "%s ", sctp_conntrack_names[state]);
+}
+
+#define for_each_sctp_chunk(skb, sch, _sch, offset, count)		\
+for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0;	\
+	offset < skb->len &&						\
+	(sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch));	\
+	offset += (htons(sch->length) + 3) & ~3, count++)
+
+/* Some validity checks to make sure the chunks are fine */
+static int do_basic_checks(struct ip_conntrack *conntrack,
+			   const struct sk_buff *skb,
+			   char *map)
+{
+	u_int32_t offset, count;
+	sctp_chunkhdr_t _sch, *sch;
+	int flag;
+
+	DEBUGP(__FUNCTION__);
+	DEBUGP("\n");
+
+	flag = 0;
+
+	for_each_sctp_chunk (skb, sch, _sch, offset, count) {
+		DEBUGP("Chunk Num: %d  Type: %d\n", count, sch->type);
+
+		if (sch->type == SCTP_CID_INIT 
+			|| sch->type == SCTP_CID_INIT_ACK
+			|| sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+			flag = 1;
+		}
+
+		/* Cookie Ack/Echo chunks not the first OR 
+		   Init / Init Ack / Shutdown compl chunks not the only chunks */
+		if ((sch->type == SCTP_CID_COOKIE_ACK 
+			|| sch->type == SCTP_CID_COOKIE_ECHO
+			|| flag)
+		     && count !=0 ) {
+			DEBUGP("Basic checks failed\n");
+			return 1;
+		}
+
+		if (map) {
+			set_bit(sch->type, (void *)map);
+		}
+	}
+
+	DEBUGP("Basic checks passed\n");
+	return 0;
+}
+
+static int new_state(enum ip_conntrack_dir dir,
+		     enum sctp_conntrack cur_state,
+		     int chunk_type)
+{
+	int i;
+
+	DEBUGP(__FUNCTION__);
+	DEBUGP("\n");
+
+	DEBUGP("Chunk type: %d\n", chunk_type);
+
+	switch (chunk_type) {
+		case SCTP_CID_INIT: 
+			DEBUGP("SCTP_CID_INIT\n");
+			i = 0; break;
+		case SCTP_CID_INIT_ACK: 
+			DEBUGP("SCTP_CID_INIT_ACK\n");
+			i = 1; break;
+		case SCTP_CID_ABORT: 
+			DEBUGP("SCTP_CID_ABORT\n");
+			i = 2; break;
+		case SCTP_CID_SHUTDOWN: 
+			DEBUGP("SCTP_CID_SHUTDOWN\n");
+			i = 3; break;
+		case SCTP_CID_SHUTDOWN_ACK: 
+			DEBUGP("SCTP_CID_SHUTDOWN_ACK\n");
+			i = 4; break;
+		case SCTP_CID_ERROR: 
+			DEBUGP("SCTP_CID_ERROR\n");
+			i = 5; break;
+		case SCTP_CID_COOKIE_ECHO: 
+			DEBUGP("SCTP_CID_COOKIE_ECHO\n");
+			i = 6; break;
+		case SCTP_CID_COOKIE_ACK: 
+			DEBUGP("SCTP_CID_COOKIE_ACK\n");
+			i = 7; break;
+		case SCTP_CID_SHUTDOWN_COMPLETE: 
+			DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n");
+			i = 8; break;
+		default:
+			/* Other chunks like DATA, SACK, HEARTBEAT and
+			its ACK do not cause a change in state */
+			DEBUGP("Unknown chunk type, Will stay in %s\n", 
+						sctp_conntrack_names[cur_state]);
+			return cur_state;
+	}
+
+	DEBUGP("dir: %d   cur_state: %s  chunk_type: %d  new_state: %s\n", 
+			dir, sctp_conntrack_names[cur_state], chunk_type,
+			sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]);
+
+	return sctp_conntracks[dir][i][cur_state];
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int sctp_packet(struct ip_conntrack *conntrack,
+		       const struct sk_buff *skb,
+		       enum ip_conntrack_info ctinfo)
+{
+	enum sctp_conntrack newconntrack, oldsctpstate;
+	struct iphdr *iph = skb->nh.iph;
+	sctp_sctphdr_t _sctph, *sh;
+	sctp_chunkhdr_t _sch, *sch;
+	u_int32_t offset, count;
+	char map[256 / sizeof (char)] = {0};
+
+	DEBUGP(__FUNCTION__);
+	DEBUGP("\n");
+
+	sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
+	if (sh == NULL)
+		return -1;
+
+	if (do_basic_checks(conntrack, skb, map) != 0)
+		return -1;
+
+	/* Check the verification tag (Sec 8.5) */
+	if (!test_bit(SCTP_CID_INIT, (void *)map)
+		&& !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)
+		&& !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map)
+		&& !test_bit(SCTP_CID_ABORT, (void *)map)
+		&& !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map)
+		&& (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
+		DEBUGP("Verification tag check failed\n");
+		return -1;
+	}
+
+	oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX;
+	for_each_sctp_chunk (skb, sch, _sch, offset, count) {
+		WRITE_LOCK(&sctp_lock);
+
+		/* Special cases of Verification tag check (Sec 8.5.1) */
+		if (sch->type == SCTP_CID_INIT) {
+			/* Sec 8.5.1 (A) */
+			if (sh->vtag != 0) {
+				WRITE_UNLOCK(&sctp_lock);
+				return -1;
+			}
+		} else if (sch->type == SCTP_CID_ABORT) {
+			/* Sec 8.5.1 (B) */
+			if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
+				&& !(sh->vtag == conntrack->proto.sctp.vtag
+							[1 - CTINFO2DIR(ctinfo)])) {
+				WRITE_UNLOCK(&sctp_lock);
+				return -1;
+			}
+		} else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) {
+			/* Sec 8.5.1 (C) */
+			if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])
+				&& !(sh->vtag == conntrack->proto.sctp.vtag
+							[1 - CTINFO2DIR(ctinfo)] 
+					&& (sch->flags & 1))) {
+				WRITE_UNLOCK(&sctp_lock);
+				return -1;
+			}
+		} else if (sch->type == SCTP_CID_COOKIE_ECHO) {
+			/* Sec 8.5.1 (D) */
+			if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) {
+				WRITE_UNLOCK(&sctp_lock);
+				return -1;
+			}
+		}
+
+		oldsctpstate = conntrack->proto.sctp.state;
+		newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type);
+
+		/* Invalid */
+		if (newconntrack == SCTP_CONNTRACK_MAX) {
+			DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n",
+			       CTINFO2DIR(ctinfo), sch->type, oldsctpstate);
+			WRITE_UNLOCK(&sctp_lock);
+			return -1;
+		}
+
+		/* If it is an INIT or an INIT ACK note down the vtag */
+		if (sch->type == SCTP_CID_INIT 
+			|| sch->type == SCTP_CID_INIT_ACK) {
+			sctp_inithdr_t _inithdr, *ih;
+
+			ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+			                        sizeof(_inithdr), &_inithdr);
+			if (ih == NULL) {
+					WRITE_UNLOCK(&sctp_lock);
+					return -1;
+			}
+			DEBUGP("Setting vtag %x for dir %d\n", 
+					ih->init_tag, !CTINFO2DIR(ctinfo));
+			conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag;
+		}
+
+		conntrack->proto.sctp.state = newconntrack;
+		WRITE_UNLOCK(&sctp_lock);
+	}
+
+	ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]);
+
+	if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED
+		&& CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY
+		&& newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
+		DEBUGP("Setting assured bit\n");
+		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+	}
+
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int sctp_new(struct ip_conntrack *conntrack, 
+		    const struct sk_buff *skb)
+{
+	enum sctp_conntrack newconntrack;
+	struct iphdr *iph = skb->nh.iph;
+	sctp_sctphdr_t _sctph, *sh;
+	sctp_chunkhdr_t _sch, *sch;
+	u_int32_t offset, count;
+	char map[256 / sizeof (char)] = {0};
+
+	DEBUGP(__FUNCTION__);
+	DEBUGP("\n");
+
+	sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph);
+	if (sh == NULL)
+		return 0;
+
+	if (do_basic_checks(conntrack, skb, map) != 0)
+		return 0;
+
+	/* If an OOTB packet has any of these chunks discard (Sec 8.4) */
+	if ((test_bit (SCTP_CID_ABORT, (void *)map))
+		|| (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map))
+		|| (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) {
+		return 0;
+	}
+
+	newconntrack = SCTP_CONNTRACK_MAX;
+	for_each_sctp_chunk (skb, sch, _sch, offset, count) {
+		/* Don't need lock here: this conntrack not in circulation yet */
+		newconntrack = new_state (IP_CT_DIR_ORIGINAL, 
+						SCTP_CONNTRACK_NONE, sch->type);
+
+		/* Invalid: delete conntrack */
+		if (newconntrack == SCTP_CONNTRACK_MAX) {
+			DEBUGP("ip_conntrack_sctp: invalid new deleting.\n");
+			return 0;
+		}
+
+		/* Copy the vtag into the state info */
+		if (sch->type == SCTP_CID_INIT) {
+			if (sh->vtag == 0) {
+				sctp_inithdr_t _inithdr, *ih;
+
+				ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t),
+				                        sizeof(_inithdr), &_inithdr);
+				if (ih == NULL)
+					return 0;
+
+				DEBUGP("Setting vtag %x for new conn\n", 
+					ih->init_tag);
+
+				conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = 
+								ih->init_tag;
+			} else {
+				/* Sec 8.5.1 (A) */
+				return 0;
+			}
+		}
+		/* If it is a shutdown ack OOTB packet, we expect a return
+		   shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */
+		else {
+			DEBUGP("Setting vtag %x for new conn OOTB\n", 
+				sh->vtag);
+			conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag;
+		}
+
+		conntrack->proto.sctp.state = newconntrack;
+	}
+
+	return 1;
+}
+
+static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { 
+	.proto 		 = IPPROTO_SCTP, 
+	.name 		 = "sctp",
+	.pkt_to_tuple 	 = sctp_pkt_to_tuple, 
+	.invert_tuple 	 = sctp_invert_tuple, 
+	.print_tuple 	 = sctp_print_tuple, 
+	.print_conntrack = sctp_print_conntrack,
+	.packet 	 = sctp_packet, 
+	.new 		 = sctp_new, 
+	.destroy 	 = NULL, 
+	.me 		 = THIS_MODULE 
+};
+
+#ifdef CONFIG_SYSCTL
+static ctl_table ip_ct_sysctl_table[] = {
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED,
+		.procname	= "ip_conntrack_sctp_timeout_closed",
+		.data		= &ip_ct_sctp_timeout_closed,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT,
+		.procname	= "ip_conntrack_sctp_timeout_cookie_wait",
+		.data		= &ip_ct_sctp_timeout_cookie_wait,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED,
+		.procname	= "ip_conntrack_sctp_timeout_cookie_echoed",
+		.data		= &ip_ct_sctp_timeout_cookie_echoed,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED,
+		.procname	= "ip_conntrack_sctp_timeout_established",
+		.data		= &ip_ct_sctp_timeout_established,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT,
+		.procname	= "ip_conntrack_sctp_timeout_shutdown_sent",
+		.data		= &ip_ct_sctp_timeout_shutdown_sent,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD,
+		.procname	= "ip_conntrack_sctp_timeout_shutdown_recd",
+		.data		= &ip_ct_sctp_timeout_shutdown_recd,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT,
+		.procname	= "ip_conntrack_sctp_timeout_shutdown_ack_sent",
+		.data		= &ip_ct_sctp_timeout_shutdown_ack_sent,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_netfilter_table[] = {
+	{
+		.ctl_name	= NET_IPV4_NETFILTER,
+		.procname	= "netfilter",
+		.mode		= 0555,
+		.child		= ip_ct_sysctl_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_ipv4_table[] = {
+	{
+		.ctl_name	= NET_IPV4,
+		.procname	= "ipv4",
+		.mode		= 0555,
+		.child		= ip_ct_netfilter_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_net_table[] = {
+	{
+		.ctl_name	= CTL_NET,
+		.procname	= "net",
+		.mode		= 0555, 
+		.child		= ip_ct_ipv4_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static struct ctl_table_header *ip_ct_sysctl_header;
+#endif
+
+static int __init init(void)
+{
+	int ret;
+
+	ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp);
+	if (ret) {
+		printk("ip_conntrack_proto_sctp: protocol register failed\n");
+		goto out;
+	}
+
+#ifdef CONFIG_SYSCTL
+	ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
+	if (ip_ct_sysctl_header == NULL) {
+		ret = -ENOMEM;
+		printk("ip_conntrack_proto_sctp: can't register to sysctl.\n");
+		goto cleanup;
+	}
+#endif
+
+	return ret;
+
+#ifdef CONFIG_SYSCTL
+ cleanup:
+	ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
+#endif
+ out:
+	DEBUGP("SCTP conntrack module loading %s\n", 
+					ret ? "failed": "succeeded");
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp);
+#ifdef CONFIG_SYSCTL
+ 	unregister_sysctl_table(ip_ct_sysctl_header);
+#endif
+	DEBUGP("SCTP conntrack module unloaded\n");
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP");
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
new file mode 100644
index 000000000000..e800b16fc920
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -0,0 +1,1098 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>:
+ *	- Real stateful connection tracking
+ *	- Modified state transitions table
+ *	- Window scaling support added
+ *	- SACK support added
+ *
+ * Willy Tarreau:
+ *	- State table bugfixes
+ *	- More robust state changes
+ *	- Tuning timer parameters
+ *
+ * version 2.2
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/module.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/spinlock.h>
+
+#include <net/tcp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+#if 0
+#define DEBUGP printk
+#define DEBUGP_VARS
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Protects conntrack->proto.tcp */
+static DECLARE_RWLOCK(tcp_lock);
+
+/* "Be conservative in what you do, 
+    be liberal in what you accept from others." 
+    If it's non-zero, we mark only out of window RST segments as INVALID. */
+int ip_ct_tcp_be_liberal = 0;
+
+/* When connection is picked up from the middle, how many packets are required
+   to pass in each direction when we assume we are in sync - if any side uses
+   window scaling, we lost the game. 
+   If it is set to zero, we disable picking up already established 
+   connections. */
+int ip_ct_tcp_loose = 3;
+
+/* Max number of the retransmitted packets without receiving an (acceptable) 
+   ACK from the destination. If this number is reached, a shorter timer 
+   will be started. */
+int ip_ct_tcp_max_retrans = 3;
+
+  /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
+     closely.  They're more complex. --RR */
+
+static const char *tcp_conntrack_names[] = {
+	"NONE",
+	"SYN_SENT",
+	"SYN_RECV",
+	"ESTABLISHED",
+	"FIN_WAIT",
+	"CLOSE_WAIT",
+	"LAST_ACK",
+	"TIME_WAIT",
+	"CLOSE",
+	"LISTEN"
+};
+  
+#define SECS * HZ
+#define MINS * 60 SECS
+#define HOURS * 60 MINS
+#define DAYS * 24 HOURS
+
+unsigned long ip_ct_tcp_timeout_syn_sent =      2 MINS;
+unsigned long ip_ct_tcp_timeout_syn_recv =     60 SECS;
+unsigned long ip_ct_tcp_timeout_established =   5 DAYS;
+unsigned long ip_ct_tcp_timeout_fin_wait =      2 MINS;
+unsigned long ip_ct_tcp_timeout_close_wait =   60 SECS;
+unsigned long ip_ct_tcp_timeout_last_ack =     30 SECS;
+unsigned long ip_ct_tcp_timeout_time_wait =     2 MINS;
+unsigned long ip_ct_tcp_timeout_close =        10 SECS;
+
+/* RFC1122 says the R2 limit should be at least 100 seconds.
+   Linux uses 15 packets as limit, which corresponds 
+   to ~13-30min depending on RTO. */
+unsigned long ip_ct_tcp_timeout_max_retrans =     5 MINS;
+ 
+static unsigned long * tcp_timeouts[]
+= { NULL,                              /*      TCP_CONNTRACK_NONE */
+    &ip_ct_tcp_timeout_syn_sent,       /*      TCP_CONNTRACK_SYN_SENT, */
+    &ip_ct_tcp_timeout_syn_recv,       /*      TCP_CONNTRACK_SYN_RECV, */
+    &ip_ct_tcp_timeout_established,    /*      TCP_CONNTRACK_ESTABLISHED,      */
+    &ip_ct_tcp_timeout_fin_wait,       /*      TCP_CONNTRACK_FIN_WAIT, */
+    &ip_ct_tcp_timeout_close_wait,     /*      TCP_CONNTRACK_CLOSE_WAIT,       */
+    &ip_ct_tcp_timeout_last_ack,       /*      TCP_CONNTRACK_LAST_ACK, */
+    &ip_ct_tcp_timeout_time_wait,      /*      TCP_CONNTRACK_TIME_WAIT,        */
+    &ip_ct_tcp_timeout_close,          /*      TCP_CONNTRACK_CLOSE,    */
+    NULL,                              /*      TCP_CONNTRACK_LISTEN */
+ };
+ 
+#define sNO TCP_CONNTRACK_NONE
+#define sSS TCP_CONNTRACK_SYN_SENT
+#define sSR TCP_CONNTRACK_SYN_RECV
+#define sES TCP_CONNTRACK_ESTABLISHED
+#define sFW TCP_CONNTRACK_FIN_WAIT
+#define sCW TCP_CONNTRACK_CLOSE_WAIT
+#define sLA TCP_CONNTRACK_LAST_ACK
+#define sTW TCP_CONNTRACK_TIME_WAIT
+#define sCL TCP_CONNTRACK_CLOSE
+#define sLI TCP_CONNTRACK_LISTEN
+#define sIV TCP_CONNTRACK_MAX
+#define sIG TCP_CONNTRACK_IGNORE
+
+/* What TCP flags are set from RST/SYN/FIN/ACK. */
+enum tcp_bit_set {
+	TCP_SYN_SET,
+	TCP_SYNACK_SET,
+	TCP_FIN_SET,
+	TCP_ACK_SET,
+	TCP_RST_SET,
+	TCP_NONE_SET,
+};
+  
+/*
+ * The TCP state transition table needs a few words...
+ *
+ * We are the man in the middle. All the packets go through us
+ * but might get lost in transit to the destination.
+ * It is assumed that the destinations can't receive segments 
+ * we haven't seen.
+ *
+ * The checked segment is in window, but our windows are *not*
+ * equivalent with the ones of the sender/receiver. We always
+ * try to guess the state of the current sender.
+ *
+ * The meaning of the states are:
+ *
+ * NONE:	initial state
+ * SYN_SENT:	SYN-only packet seen 
+ * SYN_RECV:	SYN-ACK packet seen
+ * ESTABLISHED:	ACK packet seen
+ * FIN_WAIT:	FIN packet seen
+ * CLOSE_WAIT:	ACK seen (after FIN) 
+ * LAST_ACK:	FIN seen (after FIN)
+ * TIME_WAIT:	last ACK seen
+ * CLOSE:	closed connection
+ *
+ * LISTEN state is not used.
+ *
+ * Packets marked as IGNORED (sIG):
+ *	if they may be either invalid or valid 
+ *	and the receiver may send back a connection 
+ *	closing RST or a SYN/ACK.
+ *
+ * Packets marked as INVALID (sIV):
+ *	if they are invalid
+ *	or we do not support the request (simultaneous open)
+ */
+static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
+	{
+/* ORIGINAL */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*syn*/	   { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
+/*
+ *	sNO -> sSS	Initialize a new connection
+ *	sSS -> sSS	Retransmitted SYN
+ *	sSR -> sIG	Late retransmitted SYN?
+ *	sES -> sIG	Error: SYNs in window outside the SYN_SENT state
+ *			are errors. Receiver will reply with RST 
+ *			and close the connection.
+ *			Or we are not in sync and hold a dead connection.
+ *	sFW -> sIG
+ *	sCW -> sIG
+ *	sLA -> sIG
+ *	sTW -> sSS	Reopened connection (RFC 1122).
+ *	sCL -> sSS
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
+/*
+ * A SYN/ACK from the client is always invalid:
+ *	- either it tries to set up a simultaneous open, which is 
+ *	  not supported;
+ *	- or the firewall has just been inserted between the two hosts
+ *	  during the session set-up. The SYN will be retransmitted 
+ *	  by the true client (or it'll time out).
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ *	sNO -> sIV	Too late and no reason to do anything...
+ *	sSS -> sIV	Client migth not send FIN in this state:
+ *			we enforce waiting for a SYN/ACK reply first.
+ *	sSR -> sFW	Close started.
+ *	sES -> sFW	
+ *	sFW -> sLA	FIN seen in both directions, waiting for
+ *			the last ACK. 
+ *			Migth be a retransmitted FIN as well...
+ *	sCW -> sLA
+ *	sLA -> sLA	Retransmitted FIN. Remain in the same state.
+ *	sTW -> sTW
+ *	sCL -> sCL
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*ack*/	   { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*
+ *	sNO -> sES	Assumed.
+ *	sSS -> sIV	ACK is invalid: we haven't seen a SYN/ACK yet.
+ *	sSR -> sES	Established state is reached.
+ *	sES -> sES	:-)
+ *	sFW -> sCW	Normal close request answered by ACK.
+ *	sCW -> sCW
+ *	sLA -> sTW	Last ACK detected.
+ *	sTW -> sTW	Retransmitted last ACK. Remain in the same state.
+ *	sCL -> sCL
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
+/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+	},
+	{
+/* REPLY */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*syn*/	   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV },
+/*
+ *	sNO -> sIV	Never reached.
+ *	sSS -> sIV	Simultaneous open, not supported
+ *	sSR -> sIV	Simultaneous open, not supported.
+ *	sES -> sIV	Server may not initiate a connection.
+ *	sFW -> sIV
+ *	sCW -> sIV
+ *	sLA -> sIV
+ *	sTW -> sIV	Reopened connection, but server may not do it.
+ *	sCL -> sIV
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
+/*
+ *	sSS -> sSR	Standard open.
+ *	sSR -> sSR	Retransmitted SYN/ACK.
+ *	sES -> sIG	Late retransmitted SYN/ACK?
+ *	sFW -> sIG	Might be SYN/ACK answering ignored SYN
+ *	sCW -> sIG
+ *	sLA -> sIG
+ *	sTW -> sIG
+ *	sCL -> sIG
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
+/*
+ *	sSS -> sIV	Server might not send FIN in this state.
+ *	sSR -> sFW	Close started.
+ *	sES -> sFW
+ *	sFW -> sLA	FIN seen in both directions.
+ *	sCW -> sLA
+ *	sLA -> sLA	Retransmitted FIN.
+ *	sTW -> sTW
+ *	sCL -> sCL
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*ack*/	   { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV },
+/*
+ *	sSS -> sIV	Might be a half-open connection.
+ *	sSR -> sSR	Might answer late resent SYN.
+ *	sES -> sES	:-)
+ *	sFW -> sCW	Normal close request answered by ACK.
+ *	sCW -> sCW
+ *	sLA -> sTW	Last ACK detected.
+ *	sTW -> sTW	Retransmitted last ACK.
+ *	sCL -> sCL
+ */
+/* 	     sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI	*/
+/*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV },
+/*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
+  	}
+};
+
+static int tcp_pkt_to_tuple(const struct sk_buff *skb,
+			    unsigned int dataoff,
+			    struct ip_conntrack_tuple *tuple)
+{
+	struct tcphdr _hdr, *hp;
+
+	/* Actually only need first 8 bytes. */
+	hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
+	if (hp == NULL)
+		return 0;
+
+	tuple->src.u.tcp.port = hp->source;
+	tuple->dst.u.tcp.port = hp->dest;
+
+	return 1;
+}
+
+static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple,
+			    const struct ip_conntrack_tuple *orig)
+{
+	tuple->src.u.tcp.port = orig->dst.u.tcp.port;
+	tuple->dst.u.tcp.port = orig->src.u.tcp.port;
+	return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int tcp_print_tuple(struct seq_file *s,
+			   const struct ip_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "sport=%hu dport=%hu ",
+			  ntohs(tuple->src.u.tcp.port),
+			  ntohs(tuple->dst.u.tcp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int tcp_print_conntrack(struct seq_file *s,
+			       const struct ip_conntrack *conntrack)
+{
+	enum tcp_conntrack state;
+
+	READ_LOCK(&tcp_lock);
+	state = conntrack->proto.tcp.state;
+	READ_UNLOCK(&tcp_lock);
+
+	return seq_printf(s, "%s ", tcp_conntrack_names[state]);
+}
+
+static unsigned int get_conntrack_index(const struct tcphdr *tcph)
+{
+	if (tcph->rst) return TCP_RST_SET;
+	else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
+	else if (tcph->fin) return TCP_FIN_SET;
+	else if (tcph->ack) return TCP_ACK_SET;
+	else return TCP_NONE_SET;
+}
+
+/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
+   in IP Filter' by Guido van Rooij.
+   
+   http://www.nluug.nl/events/sane2000/papers.html
+   http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
+   
+   The boundaries and the conditions are changed according to RFC793:
+   the packet must intersect the window (i.e. segments may be
+   after the right or before the left edge) and thus receivers may ACK
+   segments after the right edge of the window.
+
+   	td_maxend = max(sack + max(win,1)) seen in reply packets
+	td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
+	td_maxwin += seq + len - sender.td_maxend
+			if seq + len > sender.td_maxend
+	td_end    = max(seq + len) seen in sent packets
+   
+   I.   Upper bound for valid data:	seq <= sender.td_maxend
+   II.  Lower bound for valid data:	seq + len >= sender.td_end - receiver.td_maxwin
+   III.	Upper bound for valid ack:      sack <= receiver.td_end
+   IV.	Lower bound for valid ack:	ack >= receiver.td_end - MAXACKWINDOW
+   	
+   where sack is the highest right edge of sack block found in the packet.
+   	
+   The upper bound limit for a valid ack is not ignored - 
+   we doesn't have to deal with fragments. 
+*/
+
+static inline __u32 segment_seq_plus_len(__u32 seq,
+					 size_t len,
+					 struct iphdr *iph,
+					 struct tcphdr *tcph)
+{
+	return (seq + len - (iph->ihl + tcph->doff)*4
+		+ (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
+}
+  
+/* Fixme: what about big packets? */
+#define MAXACKWINCONST			66000
+#define MAXACKWINDOW(sender)						\
+	((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin	\
+					      : MAXACKWINCONST)
+  
+/*
+ * Simplified tcp_parse_options routine from tcp_input.c
+ */
+static void tcp_options(const struct sk_buff *skb,
+			struct iphdr *iph,
+			struct tcphdr *tcph, 
+			struct ip_ct_tcp_state *state)
+{
+	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+	unsigned char *ptr;
+	int length = (tcph->doff*4) - sizeof(struct tcphdr);
+	
+	if (!length)
+		return;
+
+	ptr = skb_header_pointer(skb,
+				 (iph->ihl * 4) + sizeof(struct tcphdr),
+				 length, buff);
+	BUG_ON(ptr == NULL);
+
+	state->td_scale = 
+	state->flags = 0;
+	
+	while (length > 0) {
+		int opcode=*ptr++;
+		int opsize;
+		
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			opsize=*ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			if (opsize > length)
+				break;	/* don't parse partial options */
+
+			if (opcode == TCPOPT_SACK_PERM 
+			    && opsize == TCPOLEN_SACK_PERM)
+				state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
+			else if (opcode == TCPOPT_WINDOW
+				 && opsize == TCPOLEN_WINDOW) {
+				state->td_scale = *(u_int8_t *)ptr;
+				
+				if (state->td_scale > 14) {
+					/* See RFC1323 */
+					state->td_scale = 14;
+				}
+				state->flags |=
+					IP_CT_TCP_FLAG_WINDOW_SCALE;
+			}
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+
+static void tcp_sack(const struct sk_buff *skb,
+		     struct iphdr *iph,
+		     struct tcphdr *tcph,
+		     __u32 *sack)
+{
+	unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
+	unsigned char *ptr;
+	int length = (tcph->doff*4) - sizeof(struct tcphdr);
+	__u32 tmp;
+
+	if (!length)
+		return;
+
+	ptr = skb_header_pointer(skb,
+				 (iph->ihl * 4) + sizeof(struct tcphdr),
+				 length, buff);
+	BUG_ON(ptr == NULL);
+
+	/* Fast path for timestamp-only option */
+	if (length == TCPOLEN_TSTAMP_ALIGNED*4
+	    && *(__u32 *)ptr ==
+	        __constant_ntohl((TCPOPT_NOP << 24) 
+	        		 | (TCPOPT_NOP << 16)
+	        		 | (TCPOPT_TIMESTAMP << 8)
+	        		 | TCPOLEN_TIMESTAMP))
+		return;
+		
+	while (length > 0) {
+		int opcode=*ptr++;
+		int opsize, i;
+		
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return;
+		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+			length--;
+			continue;
+		default:
+			opsize=*ptr++;
+			if (opsize < 2) /* "silly options" */
+				return;
+			if (opsize > length)
+				break;	/* don't parse partial options */
+
+			if (opcode == TCPOPT_SACK 
+			    && opsize >= (TCPOLEN_SACK_BASE 
+			    		  + TCPOLEN_SACK_PERBLOCK)
+			    && !((opsize - TCPOLEN_SACK_BASE) 
+			    	 % TCPOLEN_SACK_PERBLOCK)) {
+			    	for (i = 0;
+			    	     i < (opsize - TCPOLEN_SACK_BASE);
+			    	     i += TCPOLEN_SACK_PERBLOCK) {
+					tmp = ntohl(*((u_int32_t *)(ptr+i)+1));
+					
+					if (after(tmp, *sack))
+						*sack = tmp;
+				}
+				return;
+			}
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+}
+
+static int tcp_in_window(struct ip_ct_tcp *state, 
+                         enum ip_conntrack_dir dir,
+                         unsigned int index,
+                         const struct sk_buff *skb,
+                         struct iphdr *iph,
+                         struct tcphdr *tcph)
+{
+	struct ip_ct_tcp_state *sender = &state->seen[dir];
+	struct ip_ct_tcp_state *receiver = &state->seen[!dir];
+	__u32 seq, ack, sack, end, win, swin;
+	int res;
+	
+	/*
+	 * Get the required data from the packet.
+	 */
+	seq = ntohl(tcph->seq);
+	ack = sack = ntohl(tcph->ack_seq);
+	win = ntohs(tcph->window);
+	end = segment_seq_plus_len(seq, skb->len, iph, tcph);
+	
+	if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
+		tcp_sack(skb, iph, tcph, &sack);
+		
+	DEBUGP("tcp_in_window: START\n");
+	DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+	       "seq=%u ack=%u sack=%u win=%u end=%u\n",
+		NIPQUAD(iph->saddr), ntohs(tcph->source), 
+		NIPQUAD(iph->daddr), ntohs(tcph->dest),
+		seq, ack, sack, win, end);
+	DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		sender->td_end, sender->td_maxend, sender->td_maxwin,
+		sender->td_scale, 
+		receiver->td_end, receiver->td_maxend, receiver->td_maxwin, 
+		receiver->td_scale);
+		
+	if (sender->td_end == 0) {
+		/*
+		 * Initialize sender data.
+		 */
+		if (tcph->syn && tcph->ack) {
+			/*
+			 * Outgoing SYN-ACK in reply to a SYN.
+			 */
+			sender->td_end = 
+			sender->td_maxend = end;
+			sender->td_maxwin = (win == 0 ? 1 : win);
+
+			tcp_options(skb, iph, tcph, sender);
+			/* 
+			 * RFC 1323:
+			 * Both sides must send the Window Scale option
+			 * to enable window scaling in either direction.
+			 */
+			if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
+			      && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
+				sender->td_scale = 
+				receiver->td_scale = 0;
+		} else {
+			/*
+			 * We are in the middle of a connection,
+			 * its history is lost for us.
+			 * Let's try to use the data from the packet.
+		 	 */
+			sender->td_end = end;
+			sender->td_maxwin = (win == 0 ? 1 : win);
+			sender->td_maxend = end + sender->td_maxwin;
+		}
+	} else if (((state->state == TCP_CONNTRACK_SYN_SENT
+		     && dir == IP_CT_DIR_ORIGINAL)
+		    || (state->state == TCP_CONNTRACK_SYN_RECV
+		        && dir == IP_CT_DIR_REPLY))
+		    && after(end, sender->td_end)) {
+		/*
+		 * RFC 793: "if a TCP is reinitialized ... then it need
+		 * not wait at all; it must only be sure to use sequence 
+		 * numbers larger than those recently used."
+		 */
+		sender->td_end =
+		sender->td_maxend = end;
+		sender->td_maxwin = (win == 0 ? 1 : win);
+
+		tcp_options(skb, iph, tcph, sender);
+	}
+	
+	if (!(tcph->ack)) {
+		/*
+		 * If there is no ACK, just pretend it was set and OK.
+		 */
+		ack = sack = receiver->td_end;
+	} else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == 
+		    (TCP_FLAG_ACK|TCP_FLAG_RST)) 
+		   && (ack == 0)) {
+		/*
+		 * Broken TCP stacks, that set ACK in RST packets as well
+		 * with zero ack value.
+		 */
+		ack = sack = receiver->td_end;
+	}
+
+	if (seq == end
+	    && (!tcph->rst 
+	        || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
+		/*
+		 * Packets contains no data: we assume it is valid
+		 * and check the ack value only.
+		 * However RST segments are always validated by their
+		 * SEQ number, except when seq == 0 (reset sent answering
+		 * SYN.
+		 */
+		seq = end = sender->td_end;
+		
+	DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+	       "seq=%u ack=%u sack =%u win=%u end=%u\n",
+		NIPQUAD(iph->saddr), ntohs(tcph->source),
+		NIPQUAD(iph->daddr), ntohs(tcph->dest),
+		seq, ack, sack, win, end);
+	DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
+	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		sender->td_end, sender->td_maxend, sender->td_maxwin,
+		sender->td_scale, 
+		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		receiver->td_scale);
+	
+	DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
+		before(seq, sender->td_maxend + 1),
+	    	after(end, sender->td_end - receiver->td_maxwin - 1),
+	    	before(sack, receiver->td_end + 1),
+	    	after(ack, receiver->td_end - MAXACKWINDOW(sender)));
+	
+	if (sender->loose || receiver->loose ||
+	    (before(seq, sender->td_maxend + 1) &&
+	     after(end, sender->td_end - receiver->td_maxwin - 1) &&
+	     before(sack, receiver->td_end + 1) &&
+	     after(ack, receiver->td_end - MAXACKWINDOW(sender)))) {
+	    	/*
+		 * Take into account window scaling (RFC 1323).
+		 */
+		if (!tcph->syn)
+			win <<= sender->td_scale;
+		
+		/*
+		 * Update sender data.
+		 */
+		swin = win + (sack - ack);
+		if (sender->td_maxwin < swin)
+			sender->td_maxwin = swin;
+		if (after(end, sender->td_end))
+			sender->td_end = end;
+		/*
+		 * Update receiver data.
+		 */
+		if (after(end, sender->td_maxend))
+			receiver->td_maxwin += end - sender->td_maxend;
+		if (after(sack + win, receiver->td_maxend - 1)) {
+			receiver->td_maxend = sack + win;
+			if (win == 0)
+				receiver->td_maxend++;
+		}
+
+		/* 
+		 * Check retransmissions.
+		 */
+		if (index == TCP_ACK_SET) {
+			if (state->last_dir == dir
+			    && state->last_seq == seq
+			    && state->last_ack == ack
+			    && state->last_end == end)
+				state->retrans++;
+			else {
+				state->last_dir = dir;
+				state->last_seq = seq;
+				state->last_ack = ack;
+				state->last_end = end;
+				state->retrans = 0;
+			}
+		}
+		/*
+		 * Close the window of disabled window tracking :-)
+		 */
+		if (sender->loose)
+			sender->loose--;
+		
+		res = 1;
+	} else {
+		if (LOG_INVALID(IPPROTO_TCP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+			"ip_ct_tcp: %s ",
+			before(seq, sender->td_maxend + 1) ?
+			after(end, sender->td_end - receiver->td_maxwin - 1) ?
+			before(sack, receiver->td_end + 1) ?
+			after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
+			: "ACK is under the lower bound (possible overly delayed ACK)"
+			: "ACK is over the upper bound (ACKed data not seen yet)"
+			: "SEQ is under the lower bound (already ACKed data retransmitted)"
+			: "SEQ is over the upper bound (over the window of the receiver)");
+
+		res = ip_ct_tcp_be_liberal;
+  	}
+  
+	DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u "
+	       "receiver end=%u maxend=%u maxwin=%u\n",
+		res, sender->td_end, sender->td_maxend, sender->td_maxwin, 
+		receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
+
+	return res;
+}
+
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+/* Update sender->td_end after NAT successfully mangled the packet */
+void ip_conntrack_tcp_update(struct sk_buff *skb,
+			     struct ip_conntrack *conntrack, 
+			     enum ip_conntrack_dir dir)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4;
+	__u32 end;
+#ifdef DEBUGP_VARS
+	struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir];
+	struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir];
+#endif
+
+	end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph);
+	
+	WRITE_LOCK(&tcp_lock);
+	/*
+	 * We have to worry for the ack in the reply packet only...
+	 */
+	if (after(end, conntrack->proto.tcp.seen[dir].td_end))
+		conntrack->proto.tcp.seen[dir].td_end = end;
+	conntrack->proto.tcp.last_end = end;
+	WRITE_UNLOCK(&tcp_lock);
+	DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
+	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		sender->td_end, sender->td_maxend, sender->td_maxwin,
+		sender->td_scale, 
+		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		receiver->td_scale);
+}
+ 
+#endif
+
+#define	TH_FIN	0x01
+#define	TH_SYN	0x02
+#define	TH_RST	0x04
+#define	TH_PUSH	0x08
+#define	TH_ACK	0x10
+#define	TH_URG	0x20
+#define	TH_ECE	0x40
+#define	TH_CWR	0x80
+
+/* table of valid flag combinations - ECE and CWR are always valid */
+static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] =
+{
+	[TH_SYN]			= 1,
+	[TH_SYN|TH_ACK]			= 1,
+	[TH_RST]			= 1,
+	[TH_RST|TH_ACK]			= 1,
+	[TH_RST|TH_ACK|TH_PUSH]		= 1,
+	[TH_FIN|TH_ACK]			= 1,
+	[TH_ACK]			= 1,
+	[TH_ACK|TH_PUSH]		= 1,
+	[TH_ACK|TH_URG]			= 1,
+	[TH_ACK|TH_URG|TH_PUSH]		= 1,
+	[TH_FIN|TH_ACK|TH_PUSH]		= 1,
+	[TH_FIN|TH_ACK|TH_URG]		= 1,
+	[TH_FIN|TH_ACK|TH_URG|TH_PUSH]	= 1,
+};
+
+/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
+static int tcp_error(struct sk_buff *skb,
+		     enum ip_conntrack_info *ctinfo,
+		     unsigned int hooknum)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct tcphdr _tcph, *th;
+	unsigned int tcplen = skb->len - iph->ihl * 4;
+	u_int8_t tcpflags;
+
+	/* Smaller that minimal TCP header? */
+	th = skb_header_pointer(skb, iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		if (LOG_INVALID(IPPROTO_TCP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				"ip_ct_tcp: short packet ");
+		return -NF_ACCEPT;
+  	}
+  
+	/* Not whole TCP header or malformed packet */
+	if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
+		if (LOG_INVALID(IPPROTO_TCP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				"ip_ct_tcp: truncated/malformed packet ");
+		return -NF_ACCEPT;
+	}
+  
+	/* Checksum invalid? Ignore.
+	 * We skip checking packets on the outgoing path
+	 * because the semantic of CHECKSUM_HW is different there 
+	 * and moreover root might send raw packets.
+	 */
+	/* FIXME: Source route IP option packets --RR */
+	if (hooknum == NF_IP_PRE_ROUTING
+	    && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP,
+			         skb->ip_summed == CHECKSUM_HW ? skb->csum
+			      	 : skb_checksum(skb, iph->ihl*4, tcplen, 0))) {
+		if (LOG_INVALID(IPPROTO_TCP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				  "ip_ct_tcp: bad TCP checksum ");
+		return -NF_ACCEPT;
+	}
+
+	/* Check TCP flags. */
+	tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR));
+	if (!tcp_valid_flags[tcpflags]) {
+		if (LOG_INVALID(IPPROTO_TCP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				  "ip_ct_tcp: invalid TCP flag combination ");
+		return -NF_ACCEPT;
+	}
+
+	return NF_ACCEPT;
+}
+
+/* Returns verdict for packet, or -1 for invalid. */
+static int tcp_packet(struct ip_conntrack *conntrack,
+		      const struct sk_buff *skb,
+		      enum ip_conntrack_info ctinfo)
+{
+	enum tcp_conntrack new_state, old_state;
+	enum ip_conntrack_dir dir;
+	struct iphdr *iph = skb->nh.iph;
+	struct tcphdr *th, _tcph;
+	unsigned long timeout;
+	unsigned int index;
+	
+	th = skb_header_pointer(skb, iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	BUG_ON(th == NULL);
+	
+	WRITE_LOCK(&tcp_lock);
+	old_state = conntrack->proto.tcp.state;
+	dir = CTINFO2DIR(ctinfo);
+	index = get_conntrack_index(th);
+	new_state = tcp_conntracks[dir][index][old_state];
+
+	switch (new_state) {
+	case TCP_CONNTRACK_IGNORE:
+		/* Either SYN in ORIGINAL
+		 * or SYN/ACK in REPLY. */
+		if (index == TCP_SYNACK_SET
+		    && conntrack->proto.tcp.last_index == TCP_SYN_SET
+		    && conntrack->proto.tcp.last_dir != dir
+		    && ntohl(th->ack_seq) ==
+		    	     conntrack->proto.tcp.last_end) {
+			/* This SYN/ACK acknowledges a SYN that we earlier 
+			 * ignored as invalid. This means that the client and
+			 * the server are both in sync, while the firewall is
+			 * not. We kill this session and block the SYN/ACK so
+			 * that the client cannot but retransmit its SYN and 
+			 * thus initiate a clean new session.
+			 */
+		    	WRITE_UNLOCK(&tcp_lock);
+			if (LOG_INVALID(IPPROTO_TCP))
+				nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+					  "ip_ct_tcp: killing out of sync session ");
+		    	if (del_timer(&conntrack->timeout))
+		    		conntrack->timeout.function((unsigned long)
+		    					    conntrack);
+		    	return -NF_DROP;
+		}
+		conntrack->proto.tcp.last_index = index;
+		conntrack->proto.tcp.last_dir = dir;
+		conntrack->proto.tcp.last_seq = ntohl(th->seq);
+		conntrack->proto.tcp.last_end = 
+		    segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th);
+		
+		WRITE_UNLOCK(&tcp_lock);
+		if (LOG_INVALID(IPPROTO_TCP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				  "ip_ct_tcp: invalid packet ignored ");
+		return NF_ACCEPT;
+	case TCP_CONNTRACK_MAX:
+		/* Invalid packet */
+		DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
+		       dir, get_conntrack_index(th),
+		       old_state);
+		WRITE_UNLOCK(&tcp_lock);
+		if (LOG_INVALID(IPPROTO_TCP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				  "ip_ct_tcp: invalid state ");
+		return -NF_ACCEPT;
+	case TCP_CONNTRACK_SYN_SENT:
+		if (old_state < TCP_CONNTRACK_TIME_WAIT)
+			break;
+		if ((conntrack->proto.tcp.seen[dir].flags &
+		         IP_CT_TCP_FLAG_CLOSE_INIT)
+		    || after(ntohl(th->seq),
+		    	     conntrack->proto.tcp.seen[dir].td_end)) {	
+		    	/* Attempt to reopen a closed connection.
+		    	* Delete this connection and look up again. */
+		    	WRITE_UNLOCK(&tcp_lock);
+		    	if (del_timer(&conntrack->timeout))
+		    		conntrack->timeout.function((unsigned long)
+		    					    conntrack);
+		    	return -NF_REPEAT;
+		} else {
+			WRITE_UNLOCK(&tcp_lock);
+			if (LOG_INVALID(IPPROTO_TCP))
+				nf_log_packet(PF_INET, 0, skb, NULL, NULL,
+				              "ip_ct_tcp: invalid SYN");
+			return -NF_ACCEPT;
+		}
+	case TCP_CONNTRACK_CLOSE:
+		if (index == TCP_RST_SET
+		    && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)
+		    && conntrack->proto.tcp.last_index == TCP_SYN_SET
+		    && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) {
+			/* RST sent to invalid SYN we had let trough
+			 * SYN was in window then, tear down connection.
+			 * We skip window checking, because packet might ACK
+			 * segments we ignored in the SYN. */
+			goto in_window;
+		}
+		/* Just fall trough */
+	default:
+		/* Keep compilers happy. */
+		break;
+	}
+
+	if (!tcp_in_window(&conntrack->proto.tcp, dir, index, 
+			   skb, iph, th)) {
+		WRITE_UNLOCK(&tcp_lock);
+		return -NF_ACCEPT;
+	}
+    in_window:
+	/* From now on we have got in-window packets */	
+	conntrack->proto.tcp.last_index = index;
+
+	DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu "
+	       "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
+		NIPQUAD(iph->saddr), ntohs(th->source),
+		NIPQUAD(iph->daddr), ntohs(th->dest),
+		(th->syn ? 1 : 0), (th->ack ? 1 : 0),
+		(th->fin ? 1 : 0), (th->rst ? 1 : 0),
+		old_state, new_state);
+
+	conntrack->proto.tcp.state = new_state;
+	if (old_state != new_state 
+	    && (new_state == TCP_CONNTRACK_FIN_WAIT
+	    	|| new_state == TCP_CONNTRACK_CLOSE))
+		conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+	timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans
+		  && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans
+		  ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
+	WRITE_UNLOCK(&tcp_lock);
+
+	if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
+		/* If only reply is a RST, we can consider ourselves not to
+		   have an established connection: this is a fairly common
+		   problem case, so we can delete the conntrack
+		   immediately.  --RR */
+		if (th->rst) {
+			if (del_timer(&conntrack->timeout))
+				conntrack->timeout.function((unsigned long)
+							    conntrack);
+			return NF_ACCEPT;
+		}
+	} else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status)
+		   && (old_state == TCP_CONNTRACK_SYN_RECV
+		       || old_state == TCP_CONNTRACK_ESTABLISHED)
+		   && new_state == TCP_CONNTRACK_ESTABLISHED) {
+		/* Set ASSURED if we see see valid ack in ESTABLISHED 
+		   after SYN_RECV or a valid answer for a picked up 
+		   connection. */
+			set_bit(IPS_ASSURED_BIT, &conntrack->status);
+	}
+	ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
+
+	return NF_ACCEPT;
+}
+ 
+/* Called when a new connection for this protocol found. */
+static int tcp_new(struct ip_conntrack *conntrack,
+		   const struct sk_buff *skb)
+{
+	enum tcp_conntrack new_state;
+	struct iphdr *iph = skb->nh.iph;
+	struct tcphdr *th, _tcph;
+#ifdef DEBUGP_VARS
+	struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0];
+	struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1];
+#endif
+
+	th = skb_header_pointer(skb, iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	BUG_ON(th == NULL);
+	
+	/* Don't need lock here: this conntrack not in circulation yet */
+	new_state
+		= tcp_conntracks[0][get_conntrack_index(th)]
+		[TCP_CONNTRACK_NONE];
+
+	/* Invalid: delete conntrack */
+	if (new_state >= TCP_CONNTRACK_MAX) {
+		DEBUGP("ip_ct_tcp: invalid new deleting.\n");
+		return 0;
+	}
+
+	if (new_state == TCP_CONNTRACK_SYN_SENT) {
+		/* SYN packet */
+		conntrack->proto.tcp.seen[0].td_end =
+			segment_seq_plus_len(ntohl(th->seq), skb->len,
+					     iph, th);
+		conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+		if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
+			conntrack->proto.tcp.seen[0].td_maxwin = 1;
+		conntrack->proto.tcp.seen[0].td_maxend =
+			conntrack->proto.tcp.seen[0].td_end;
+
+		tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]);
+		conntrack->proto.tcp.seen[1].flags = 0;
+		conntrack->proto.tcp.seen[0].loose = 
+		conntrack->proto.tcp.seen[1].loose = 0;
+	} else if (ip_ct_tcp_loose == 0) {
+		/* Don't try to pick up connections. */
+		return 0;
+	} else {
+		/*
+		 * We are in the middle of a connection,
+		 * its history is lost for us.
+		 * Let's try to use the data from the packet.
+		 */
+		conntrack->proto.tcp.seen[0].td_end =
+			segment_seq_plus_len(ntohl(th->seq), skb->len,
+					     iph, th);
+		conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
+		if (conntrack->proto.tcp.seen[0].td_maxwin == 0)
+			conntrack->proto.tcp.seen[0].td_maxwin = 1;
+		conntrack->proto.tcp.seen[0].td_maxend =
+			conntrack->proto.tcp.seen[0].td_end + 
+			conntrack->proto.tcp.seen[0].td_maxwin;
+		conntrack->proto.tcp.seen[0].td_scale = 0;
+
+		/* We assume SACK. Should we assume window scaling too? */
+		conntrack->proto.tcp.seen[0].flags =
+		conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM;
+		conntrack->proto.tcp.seen[0].loose = 
+		conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose;
+	}
+    
+	conntrack->proto.tcp.seen[1].td_end = 0;
+	conntrack->proto.tcp.seen[1].td_maxend = 0;
+	conntrack->proto.tcp.seen[1].td_maxwin = 1;
+	conntrack->proto.tcp.seen[1].td_scale = 0;      
+
+	/* tcp_packet will set them */
+	conntrack->proto.tcp.state = TCP_CONNTRACK_NONE;
+	conntrack->proto.tcp.last_index = TCP_NONE_SET;
+	 
+	DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
+	       "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
+		sender->td_end, sender->td_maxend, sender->td_maxwin,
+		sender->td_scale, 
+		receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
+		receiver->td_scale);
+	return 1;
+}
+  
+struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
+{
+	.proto 			= IPPROTO_TCP,
+	.name 			= "tcp",
+	.pkt_to_tuple 		= tcp_pkt_to_tuple,
+	.invert_tuple 		= tcp_invert_tuple,
+	.print_tuple 		= tcp_print_tuple,
+	.print_conntrack 	= tcp_print_conntrack,
+	.packet 		= tcp_packet,
+	.new 			= tcp_new,
+	.error			= tcp_error,
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
new file mode 100644
index 000000000000..5bc28a224623
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -0,0 +1,146 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/netfilter.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <linux/seq_file.h>
+#include <net/checksum.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+
+unsigned long ip_ct_udp_timeout = 30*HZ;
+unsigned long ip_ct_udp_timeout_stream = 180*HZ;
+
+static int udp_pkt_to_tuple(const struct sk_buff *skb,
+			     unsigned int dataoff,
+			     struct ip_conntrack_tuple *tuple)
+{
+	struct udphdr _hdr, *hp;
+
+	/* Actually only need first 8 bytes. */
+	hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr);
+	if (hp == NULL)
+		return 0;
+
+	tuple->src.u.udp.port = hp->source;
+	tuple->dst.u.udp.port = hp->dest;
+
+	return 1;
+}
+
+static int udp_invert_tuple(struct ip_conntrack_tuple *tuple,
+			    const struct ip_conntrack_tuple *orig)
+{
+	tuple->src.u.udp.port = orig->dst.u.udp.port;
+	tuple->dst.u.udp.port = orig->src.u.udp.port;
+	return 1;
+}
+
+/* Print out the per-protocol part of the tuple. */
+static int udp_print_tuple(struct seq_file *s,
+			   const struct ip_conntrack_tuple *tuple)
+{
+	return seq_printf(s, "sport=%hu dport=%hu ",
+			  ntohs(tuple->src.u.udp.port),
+			  ntohs(tuple->dst.u.udp.port));
+}
+
+/* Print out the private part of the conntrack. */
+static int udp_print_conntrack(struct seq_file *s,
+			       const struct ip_conntrack *conntrack)
+{
+	return 0;
+}
+
+/* Returns verdict for packet, and may modify conntracktype */
+static int udp_packet(struct ip_conntrack *conntrack,
+		      const struct sk_buff *skb,
+		      enum ip_conntrack_info ctinfo)
+{
+	/* If we've seen traffic both ways, this is some kind of UDP
+	   stream.  Extend timeout. */
+	if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
+		ip_ct_refresh_acct(conntrack, ctinfo, skb, 
+				   ip_ct_udp_timeout_stream);
+		/* Also, more likely to be important, and not a probe */
+		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+	} else
+		ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
+
+	return NF_ACCEPT;
+}
+
+/* Called when a new connection for this protocol found. */
+static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb)
+{
+	return 1;
+}
+
+static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo,
+		     unsigned int hooknum)
+{
+	struct iphdr *iph = skb->nh.iph;
+	unsigned int udplen = skb->len - iph->ihl * 4;
+	struct udphdr _hdr, *hdr;
+
+	/* Header is too small? */
+	hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr);
+	if (hdr == NULL) {
+		if (LOG_INVALID(IPPROTO_UDP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				  "ip_ct_udp: short packet ");
+		return -NF_ACCEPT;
+	}
+	
+	/* Truncated/malformed packets */
+	if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) {
+		if (LOG_INVALID(IPPROTO_UDP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				  "ip_ct_udp: truncated/malformed packet ");
+		return -NF_ACCEPT;
+	}
+	
+	/* Packet with no checksum */
+	if (!hdr->check)
+		return NF_ACCEPT;
+
+	/* Checksum invalid? Ignore.
+	 * We skip checking packets on the outgoing path
+	 * because the semantic of CHECKSUM_HW is different there 
+	 * and moreover root might send raw packets.
+	 * FIXME: Source route IP option packets --RR */
+	if (hooknum == NF_IP_PRE_ROUTING
+	    && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP,
+			         skb->ip_summed == CHECKSUM_HW ? skb->csum
+			      	 : skb_checksum(skb, iph->ihl*4, udplen, 0))) {
+		if (LOG_INVALID(IPPROTO_UDP))
+			nf_log_packet(PF_INET, 0, skb, NULL, NULL, 
+				  "ip_ct_udp: bad UDP checksum ");
+		return -NF_ACCEPT;
+	}
+	
+	return NF_ACCEPT;
+}
+
+struct ip_conntrack_protocol ip_conntrack_protocol_udp =
+{
+	.proto 			= IPPROTO_UDP,
+	.name			= "udp",
+	.pkt_to_tuple		= udp_pkt_to_tuple,
+	.invert_tuple		= udp_invert_tuple,
+	.print_tuple		= udp_print_tuple,
+	.print_conntrack	= udp_print_conntrack,
+	.packet			= udp_packet,
+	.new			= udp_new,
+	.error			= udp_error,
+};
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
new file mode 100644
index 000000000000..80a7bde2a57a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -0,0 +1,961 @@
+/* This file contains all the functions required for the standalone
+   ip_conntrack module.
+
+   These are not required by the compatibility layer.
+*/
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+#include <net/checksum.h>
+#include <net/ip.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+MODULE_LICENSE("GPL");
+
+extern atomic_t ip_conntrack_count;
+DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
+
+static int kill_proto(struct ip_conntrack *i, void *data)
+{
+	return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == 
+			*((u_int8_t *) data));
+}
+
+#ifdef CONFIG_PROC_FS
+static int
+print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple,
+	    struct ip_conntrack_protocol *proto)
+{
+	seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ",
+		   NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip));
+	return proto->print_tuple(s, tuple);
+}
+
+#ifdef CONFIG_IP_NF_CT_ACCT
+static unsigned int
+seq_print_counters(struct seq_file *s,
+		   const struct ip_conntrack_counter *counter)
+{
+	return seq_printf(s, "packets=%llu bytes=%llu ",
+			  (unsigned long long)counter->packets,
+			  (unsigned long long)counter->bytes);
+}
+#else
+#define seq_print_counters(x, y)	0
+#endif
+
+struct ct_iter_state {
+	unsigned int bucket;
+};
+
+static struct list_head *ct_get_first(struct seq_file *seq)
+{
+	struct ct_iter_state *st = seq->private;
+
+	for (st->bucket = 0;
+	     st->bucket < ip_conntrack_htable_size;
+	     st->bucket++) {
+		if (!list_empty(&ip_conntrack_hash[st->bucket]))
+			return ip_conntrack_hash[st->bucket].next;
+	}
+	return NULL;
+}
+
+static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head)
+{
+	struct ct_iter_state *st = seq->private;
+
+	head = head->next;
+	while (head == &ip_conntrack_hash[st->bucket]) {
+		if (++st->bucket >= ip_conntrack_htable_size)
+			return NULL;
+		head = ip_conntrack_hash[st->bucket].next;
+	}
+	return head;
+}
+
+static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct list_head *head = ct_get_first(seq);
+
+	if (head)
+		while (pos && (head = ct_get_next(seq, head)))
+			pos--;
+	return pos ? NULL : head;
+}
+
+static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	READ_LOCK(&ip_conntrack_lock);
+	return ct_get_idx(seq, *pos);
+}
+
+static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return ct_get_next(s, v);
+}
+  
+static void ct_seq_stop(struct seq_file *s, void *v)
+{
+	READ_UNLOCK(&ip_conntrack_lock);
+}
+ 
+static int ct_seq_show(struct seq_file *s, void *v)
+{
+	const struct ip_conntrack_tuple_hash *hash = v;
+	const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash);
+	struct ip_conntrack_protocol *proto;
+
+	MUST_BE_READ_LOCKED(&ip_conntrack_lock);
+	IP_NF_ASSERT(conntrack);
+
+	/* we only want to print DIR_ORIGINAL */
+	if (DIRECTION(hash))
+		return 0;
+
+	proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+			       .tuple.dst.protonum);
+	IP_NF_ASSERT(proto);
+
+	if (seq_printf(s, "%-8s %u %ld ",
+		      proto->name,
+		      conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum,
+		      timer_pending(&conntrack->timeout)
+		      ? (long)(conntrack->timeout.expires - jiffies)/HZ
+		      : 0) != 0)
+		return -ENOSPC;
+
+	if (proto->print_conntrack(s, conntrack))
+		return -ENOSPC;
+  
+	if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+			proto))
+		return -ENOSPC;
+
+ 	if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL]))
+		return -ENOSPC;
+
+	if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)))
+		if (seq_printf(s, "[UNREPLIED] "))
+			return -ENOSPC;
+
+	if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple,
+			proto))
+		return -ENOSPC;
+
+ 	if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY]))
+		return -ENOSPC;
+
+	if (test_bit(IPS_ASSURED_BIT, &conntrack->status))
+		if (seq_printf(s, "[ASSURED] "))
+			return -ENOSPC;
+
+#if defined(CONFIG_IP_NF_CONNTRACK_MARK)
+	if (seq_printf(s, "mark=%lu ", conntrack->mark))
+		return -ENOSPC;
+#endif
+
+	if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use)))
+		return -ENOSPC;
+
+	return 0;
+}
+
+static struct seq_operations ct_seq_ops = {
+	.start = ct_seq_start,
+	.next  = ct_seq_next,
+	.stop  = ct_seq_stop,
+	.show  = ct_seq_show
+};
+  
+static int ct_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	struct ct_iter_state *st;
+	int ret;
+
+	st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL);
+	if (st == NULL)
+		return -ENOMEM;
+	ret = seq_open(file, &ct_seq_ops);
+	if (ret)
+		goto out_free;
+	seq          = file->private_data;
+	seq->private = st;
+	memset(st, 0, sizeof(struct ct_iter_state));
+	return ret;
+out_free:
+	kfree(st);
+	return ret;
+}
+
+static struct file_operations ct_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = ct_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+  
+/* expects */
+static void *exp_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct list_head *e = &ip_conntrack_expect_list;
+	loff_t i;
+
+	/* strange seq_file api calls stop even if we fail,
+	 * thus we need to grab lock since stop unlocks */
+	READ_LOCK(&ip_conntrack_lock);
+
+	if (list_empty(e))
+		return NULL;
+
+	for (i = 0; i <= *pos; i++) {
+		e = e->next;
+		if (e == &ip_conntrack_expect_list)
+			return NULL;
+	}
+	return e;
+}
+
+static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ 	struct list_head *e = v;
+
+	e = e->next;
+
+	if (e == &ip_conntrack_expect_list)
+		return NULL;
+
+	return e;
+}
+
+static void exp_seq_stop(struct seq_file *s, void *v)
+{
+	READ_UNLOCK(&ip_conntrack_lock);
+}
+
+static int exp_seq_show(struct seq_file *s, void *v)
+{
+	struct ip_conntrack_expect *expect = v;
+
+	if (expect->timeout.function)
+		seq_printf(s, "%ld ", timer_pending(&expect->timeout)
+			   ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
+	else
+		seq_printf(s, "- ");
+
+	seq_printf(s, "proto=%u ", expect->tuple.dst.protonum);
+
+	print_tuple(s, &expect->tuple,
+		    ip_ct_find_proto(expect->tuple.dst.protonum));
+	return seq_putc(s, '\n');
+}
+
+static struct seq_operations exp_seq_ops = {
+	.start = exp_seq_start,
+	.next = exp_seq_next,
+	.stop = exp_seq_stop,
+	.show = exp_seq_show
+};
+
+static int exp_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &exp_seq_ops);
+}
+  
+static struct file_operations exp_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = exp_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu+1;
+		return &per_cpu(ip_conntrack_stat, cpu);
+	}
+
+	return NULL;
+}
+
+static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	int cpu;
+
+	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu+1;
+		return &per_cpu(ip_conntrack_stat, cpu);
+	}
+
+	return NULL;
+}
+
+static void ct_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+}
+
+static int ct_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	unsigned int nr_conntracks = atomic_read(&ip_conntrack_count);
+	struct ip_conntrack_stat *st = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries  searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error  expect_new expect_create expect_delete\n");
+		return 0;
+	}
+
+	seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
+			"%08x %08x %08x %08x %08x  %08x %08x %08x \n",
+		   nr_conntracks,
+		   st->searched,
+		   st->found,
+		   st->new,
+		   st->invalid,
+		   st->ignore,
+		   st->delete,
+		   st->delete_list,
+		   st->insert,
+		   st->insert_failed,
+		   st->drop,
+		   st->early_drop,
+		   st->error,
+
+		   st->expect_new,
+		   st->expect_create,
+		   st->expect_delete
+		);
+	return 0;
+}
+
+static struct seq_operations ct_cpu_seq_ops = {
+	.start  = ct_cpu_seq_start,
+	.next   = ct_cpu_seq_next,
+	.stop   = ct_cpu_seq_stop,
+	.show   = ct_cpu_seq_show,
+};
+
+static int ct_cpu_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &ct_cpu_seq_ops);
+}
+
+static struct file_operations ct_cpu_seq_fops = {
+	.owner   = THIS_MODULE,
+	.open    = ct_cpu_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release_private,
+};
+#endif
+
+static unsigned int ip_confirm(unsigned int hooknum,
+			       struct sk_buff **pskb,
+			       const struct net_device *in,
+			       const struct net_device *out,
+			       int (*okfn)(struct sk_buff *))
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+
+	/* This is where we call the helper: as the packet goes out. */
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+	if (ct && ct->helper) {
+		unsigned int ret;
+		ret = ct->helper->help(pskb, ct, ctinfo);
+		if (ret != NF_ACCEPT)
+			return ret;
+	}
+
+	/* We've seen it coming out the other side: confirm it */
+	return ip_conntrack_confirm(pskb);
+}
+
+static unsigned int ip_conntrack_defrag(unsigned int hooknum,
+				        struct sk_buff **pskb,
+				        const struct net_device *in,
+				        const struct net_device *out,
+				        int (*okfn)(struct sk_buff *))
+{
+#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE)
+	/* Previously seen (loopback)?  Ignore.  Do this before
+           fragment check. */
+	if ((*pskb)->nfct)
+		return NF_ACCEPT;
+#endif
+
+	/* Gather fragments. */
+	if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+		*pskb = ip_ct_gather_frags(*pskb,
+		                           hooknum == NF_IP_PRE_ROUTING ? 
+					   IP_DEFRAG_CONNTRACK_IN :
+					   IP_DEFRAG_CONNTRACK_OUT);
+		if (!*pskb)
+			return NF_STOLEN;
+	}
+	return NF_ACCEPT;
+}
+
+static unsigned int ip_refrag(unsigned int hooknum,
+			      struct sk_buff **pskb,
+			      const struct net_device *in,
+			      const struct net_device *out,
+			      int (*okfn)(struct sk_buff *))
+{
+	struct rtable *rt = (struct rtable *)(*pskb)->dst;
+
+	/* We've seen it coming out the other side: confirm */
+	if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT)
+		return NF_DROP;
+
+	/* Local packets are never produced too large for their
+	   interface.  We degfragment them at LOCAL_OUT, however,
+	   so we have to refragment them here. */
+	if ((*pskb)->len > dst_mtu(&rt->u.dst) &&
+	    !skb_shinfo(*pskb)->tso_size) {
+		/* No hook can be after us, so this should be OK. */
+		ip_fragment(*pskb, okfn);
+		return NF_STOLEN;
+	}
+	return NF_ACCEPT;
+}
+
+static unsigned int ip_conntrack_local(unsigned int hooknum,
+				       struct sk_buff **pskb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       int (*okfn)(struct sk_buff *))
+{
+	/* root is playing with raw sockets. */
+	if ((*pskb)->len < sizeof(struct iphdr)
+	    || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+		if (net_ratelimit())
+			printk("ipt_hook: happy cracking.\n");
+		return NF_ACCEPT;
+	}
+	return ip_conntrack_in(hooknum, pskb, in, out, okfn);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+   make it the first hook. */
+static struct nf_hook_ops ip_conntrack_defrag_ops = {
+	.hook		= ip_conntrack_defrag,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_PRE_ROUTING,
+	.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
+};
+
+static struct nf_hook_ops ip_conntrack_in_ops = {
+	.hook		= ip_conntrack_in,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_PRE_ROUTING,
+	.priority	= NF_IP_PRI_CONNTRACK,
+};
+
+static struct nf_hook_ops ip_conntrack_defrag_local_out_ops = {
+	.hook		= ip_conntrack_defrag,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_LOCAL_OUT,
+	.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
+};
+
+static struct nf_hook_ops ip_conntrack_local_out_ops = {
+	.hook		= ip_conntrack_local,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_LOCAL_OUT,
+	.priority	= NF_IP_PRI_CONNTRACK,
+};
+
+/* Refragmenter; last chance. */
+static struct nf_hook_ops ip_conntrack_out_ops = {
+	.hook		= ip_refrag,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_POST_ROUTING,
+	.priority	= NF_IP_PRI_LAST,
+};
+
+static struct nf_hook_ops ip_conntrack_local_in_ops = {
+	.hook		= ip_confirm,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_LOCAL_IN,
+	.priority	= NF_IP_PRI_LAST-1,
+};
+
+/* Sysctl support */
+
+#ifdef CONFIG_SYSCTL
+
+/* From ip_conntrack_core.c */
+extern int ip_conntrack_max;
+extern unsigned int ip_conntrack_htable_size;
+
+/* From ip_conntrack_proto_tcp.c */
+extern unsigned long ip_ct_tcp_timeout_syn_sent;
+extern unsigned long ip_ct_tcp_timeout_syn_recv;
+extern unsigned long ip_ct_tcp_timeout_established;
+extern unsigned long ip_ct_tcp_timeout_fin_wait;
+extern unsigned long ip_ct_tcp_timeout_close_wait;
+extern unsigned long ip_ct_tcp_timeout_last_ack;
+extern unsigned long ip_ct_tcp_timeout_time_wait;
+extern unsigned long ip_ct_tcp_timeout_close;
+extern unsigned long ip_ct_tcp_timeout_max_retrans;
+extern int ip_ct_tcp_loose;
+extern int ip_ct_tcp_be_liberal;
+extern int ip_ct_tcp_max_retrans;
+
+/* From ip_conntrack_proto_udp.c */
+extern unsigned long ip_ct_udp_timeout;
+extern unsigned long ip_ct_udp_timeout_stream;
+
+/* From ip_conntrack_proto_icmp.c */
+extern unsigned long ip_ct_icmp_timeout;
+
+/* From ip_conntrack_proto_icmp.c */
+extern unsigned long ip_ct_generic_timeout;
+
+/* Log invalid packets of a given protocol */
+static int log_invalid_proto_min = 0;
+static int log_invalid_proto_max = 255;
+
+static struct ctl_table_header *ip_ct_sysctl_header;
+
+static ctl_table ip_ct_sysctl_table[] = {
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_MAX,
+		.procname	= "ip_conntrack_max",
+		.data		= &ip_conntrack_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_COUNT,
+		.procname	= "ip_conntrack_count",
+		.data		= &ip_conntrack_count,
+		.maxlen		= sizeof(int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_BUCKETS,
+		.procname	= "ip_conntrack_buckets",
+		.data		= &ip_conntrack_htable_size,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT,
+		.procname	= "ip_conntrack_tcp_timeout_syn_sent",
+		.data		= &ip_ct_tcp_timeout_syn_sent,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV,
+		.procname	= "ip_conntrack_tcp_timeout_syn_recv",
+		.data		= &ip_ct_tcp_timeout_syn_recv,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED,
+		.procname	= "ip_conntrack_tcp_timeout_established",
+		.data		= &ip_ct_tcp_timeout_established,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT,
+		.procname	= "ip_conntrack_tcp_timeout_fin_wait",
+		.data		= &ip_ct_tcp_timeout_fin_wait,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT,
+		.procname	= "ip_conntrack_tcp_timeout_close_wait",
+		.data		= &ip_ct_tcp_timeout_close_wait,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK,
+		.procname	= "ip_conntrack_tcp_timeout_last_ack",
+		.data		= &ip_ct_tcp_timeout_last_ack,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT,
+		.procname	= "ip_conntrack_tcp_timeout_time_wait",
+		.data		= &ip_ct_tcp_timeout_time_wait,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE,
+		.procname	= "ip_conntrack_tcp_timeout_close",
+		.data		= &ip_ct_tcp_timeout_close,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT,
+		.procname	= "ip_conntrack_udp_timeout",
+		.data		= &ip_ct_udp_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM,
+		.procname	= "ip_conntrack_udp_timeout_stream",
+		.data		= &ip_ct_udp_timeout_stream,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT,
+		.procname	= "ip_conntrack_icmp_timeout",
+		.data		= &ip_ct_icmp_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT,
+		.procname	= "ip_conntrack_generic_timeout",
+		.data		= &ip_ct_generic_timeout,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_LOG_INVALID,
+		.procname	= "ip_conntrack_log_invalid",
+		.data		= &ip_ct_log_invalid,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &log_invalid_proto_min,
+		.extra2		= &log_invalid_proto_max,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS,
+		.procname	= "ip_conntrack_tcp_timeout_max_retrans",
+		.data		= &ip_ct_tcp_timeout_max_retrans,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
+		.procname	= "ip_conntrack_tcp_loose",
+		.data		= &ip_ct_tcp_loose,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
+		.procname	= "ip_conntrack_tcp_be_liberal",
+		.data		= &ip_ct_tcp_be_liberal,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
+		.procname	= "ip_conntrack_tcp_max_retrans",
+		.data		= &ip_ct_tcp_max_retrans,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+
+#define NET_IP_CONNTRACK_MAX 2089
+
+static ctl_table ip_ct_netfilter_table[] = {
+	{
+		.ctl_name	= NET_IPV4_NETFILTER,
+		.procname	= "netfilter",
+		.mode		= 0555,
+		.child		= ip_ct_sysctl_table,
+	},
+	{
+		.ctl_name	= NET_IP_CONNTRACK_MAX,
+		.procname	= "ip_conntrack_max",
+		.data		= &ip_conntrack_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_ipv4_table[] = {
+	{
+		.ctl_name	= NET_IPV4,
+		.procname	= "ipv4",
+		.mode		= 0555,
+		.child		= ip_ct_netfilter_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ip_ct_net_table[] = {
+	{
+		.ctl_name	= CTL_NET,
+		.procname	= "net",
+		.mode		= 0555, 
+		.child		= ip_ct_ipv4_table,
+	},
+	{ .ctl_name = 0 }
+};
+
+EXPORT_SYMBOL(ip_ct_log_invalid);
+#endif /* CONFIG_SYSCTL */
+
+static int init_or_cleanup(int init)
+{
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *proc, *proc_exp, *proc_stat;
+#endif
+	int ret = 0;
+
+	if (!init) goto cleanup;
+
+	ret = ip_conntrack_init();
+	if (ret < 0)
+		goto cleanup_nothing;
+
+#ifdef CONFIG_PROC_FS
+	ret = -ENOMEM;
+	proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops);
+	if (!proc) goto cleanup_init;
+
+	proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440,
+					&exp_file_ops);
+	if (!proc_exp) goto cleanup_proc;
+
+	proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat);
+	if (!proc_stat)
+		goto cleanup_proc_exp;
+
+	proc_stat->proc_fops = &ct_cpu_seq_fops;
+	proc_stat->owner = THIS_MODULE;
+#endif
+
+	ret = nf_register_hook(&ip_conntrack_defrag_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register pre-routing defrag hook.\n");
+		goto cleanup_proc_stat;
+	}
+	ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register local_out defrag hook.\n");
+		goto cleanup_defragops;
+	}
+	ret = nf_register_hook(&ip_conntrack_in_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register pre-routing hook.\n");
+		goto cleanup_defraglocalops;
+	}
+	ret = nf_register_hook(&ip_conntrack_local_out_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register local out hook.\n");
+		goto cleanup_inops;
+	}
+	ret = nf_register_hook(&ip_conntrack_out_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register post-routing hook.\n");
+		goto cleanup_inandlocalops;
+	}
+	ret = nf_register_hook(&ip_conntrack_local_in_ops);
+	if (ret < 0) {
+		printk("ip_conntrack: can't register local in hook.\n");
+		goto cleanup_inoutandlocalops;
+	}
+#ifdef CONFIG_SYSCTL
+	ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0);
+	if (ip_ct_sysctl_header == NULL) {
+		printk("ip_conntrack: can't register to sysctl.\n");
+		ret = -ENOMEM;
+		goto cleanup_localinops;
+	}
+#endif
+
+	return ret;
+
+ cleanup:
+#ifdef CONFIG_SYSCTL
+ 	unregister_sysctl_table(ip_ct_sysctl_header);
+ cleanup_localinops:
+#endif
+	nf_unregister_hook(&ip_conntrack_local_in_ops);
+ cleanup_inoutandlocalops:
+	nf_unregister_hook(&ip_conntrack_out_ops);
+ cleanup_inandlocalops:
+	nf_unregister_hook(&ip_conntrack_local_out_ops);
+ cleanup_inops:
+	nf_unregister_hook(&ip_conntrack_in_ops);
+ cleanup_defraglocalops:
+	nf_unregister_hook(&ip_conntrack_defrag_local_out_ops);
+ cleanup_defragops:
+	nf_unregister_hook(&ip_conntrack_defrag_ops);
+ cleanup_proc_stat:
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("ip_conntrack", proc_net_stat);
+ cleanup_proc_exp:
+	proc_net_remove("ip_conntrack_expect");
+ cleanup_proc:
+	proc_net_remove("ip_conntrack");
+ cleanup_init:
+#endif /* CONFIG_PROC_FS */
+	ip_conntrack_cleanup();
+ cleanup_nothing:
+	return ret;
+}
+
+/* FIXME: Allow NULL functions and sub in pointers to generic for
+   them. --RR */
+int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto)
+{
+	int ret = 0;
+
+	WRITE_LOCK(&ip_conntrack_lock);
+	if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) {
+		ret = -EBUSY;
+		goto out;
+	}
+	ip_ct_protos[proto->proto] = proto;
+ out:
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	return ret;
+}
+
+void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto)
+{
+	WRITE_LOCK(&ip_conntrack_lock);
+	ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol;
+	WRITE_UNLOCK(&ip_conntrack_lock);
+	
+	/* Somebody could be still looking at the proto in bh. */
+	synchronize_net();
+
+	/* Remove all contrack entries for this protocol */
+	ip_ct_iterate_cleanup(kill_proto, &proto->proto);
+}
+
+static int __init init(void)
+{
+	return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+	init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+/* Some modules need us, but don't depend directly on any symbol.
+   They should call this. */
+void need_ip_conntrack(void)
+{
+}
+
+EXPORT_SYMBOL(ip_conntrack_protocol_register);
+EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
+EXPORT_SYMBOL(ip_ct_get_tuple);
+EXPORT_SYMBOL(invert_tuplepr);
+EXPORT_SYMBOL(ip_conntrack_alter_reply);
+EXPORT_SYMBOL(ip_conntrack_destroyed);
+EXPORT_SYMBOL(need_ip_conntrack);
+EXPORT_SYMBOL(ip_conntrack_helper_register);
+EXPORT_SYMBOL(ip_conntrack_helper_unregister);
+EXPORT_SYMBOL(ip_ct_iterate_cleanup);
+EXPORT_SYMBOL(ip_ct_refresh_acct);
+EXPORT_SYMBOL(ip_ct_protos);
+EXPORT_SYMBOL(ip_ct_find_proto);
+EXPORT_SYMBOL(ip_conntrack_expect_alloc);
+EXPORT_SYMBOL(ip_conntrack_expect_free);
+EXPORT_SYMBOL(ip_conntrack_expect_related);
+EXPORT_SYMBOL(ip_conntrack_unexpect_related);
+EXPORT_SYMBOL(ip_conntrack_tuple_taken);
+EXPORT_SYMBOL(ip_ct_gather_frags);
+EXPORT_SYMBOL(ip_conntrack_htable_size);
+EXPORT_SYMBOL(ip_conntrack_lock);
+EXPORT_SYMBOL(ip_conntrack_hash);
+EXPORT_SYMBOL(ip_conntrack_untracked);
+EXPORT_SYMBOL_GPL(ip_conntrack_find_get);
+EXPORT_SYMBOL_GPL(ip_conntrack_put);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+EXPORT_SYMBOL(ip_conntrack_tcp_update);
+#endif
diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c
new file mode 100644
index 000000000000..992fac3e36ee
--- /dev/null
+++ b/net/ipv4/netfilter/ip_conntrack_tftp.c
@@ -0,0 +1,159 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Version: 0.0.7
+ *
+ * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
+ * 	- port to newnat API
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
+#include <linux/moduleparam.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("tftp connection tracking helper");
+MODULE_LICENSE("GPL");
+
+#define MAX_PORTS 8
+static int ports[MAX_PORTS];
+static int ports_c;
+module_param_array(ports, int, &ports_c, 0400);
+MODULE_PARM_DESC(ports, "port numbers of tftp servers");
+
+#if 0
+#define DEBUGP(format, args...) printk("%s:%s:" format, \
+                                       __FILE__, __FUNCTION__ , ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb,
+				 enum ip_conntrack_info ctinfo,
+				 struct ip_conntrack_expect *exp);
+EXPORT_SYMBOL_GPL(ip_nat_tftp_hook);
+
+static int tftp_help(struct sk_buff **pskb,
+		     struct ip_conntrack *ct,
+		     enum ip_conntrack_info ctinfo)
+{
+	struct tftphdr _tftph, *tfh;
+	struct ip_conntrack_expect *exp;
+	unsigned int ret = NF_ACCEPT;
+
+	tfh = skb_header_pointer(*pskb,
+				 (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr),
+				 sizeof(_tftph), &_tftph);
+	if (tfh == NULL)
+		return NF_ACCEPT;
+
+	switch (ntohs(tfh->opcode)) {
+	/* RRQ and WRQ works the same way */
+	case TFTP_OPCODE_READ:
+	case TFTP_OPCODE_WRITE:
+		DEBUGP("");
+		DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+		exp = ip_conntrack_expect_alloc();
+		if (exp == NULL)
+			return NF_DROP;
+
+		exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+		exp->mask.src.ip = 0xffffffff;
+		exp->mask.dst.ip = 0xffffffff;
+		exp->mask.dst.u.udp.port = 0xffff;
+		exp->mask.dst.protonum = 0xff;
+		exp->expectfn = NULL;
+		exp->master = ct;
+
+		DEBUGP("expect: ");
+		DUMP_TUPLE(&exp->tuple);
+		DUMP_TUPLE(&exp->mask);
+		if (ip_nat_tftp_hook)
+			ret = ip_nat_tftp_hook(pskb, ctinfo, exp);
+		else if (ip_conntrack_expect_related(exp) != 0) {
+			ip_conntrack_expect_free(exp);
+			ret = NF_DROP;
+		}
+		break;
+	case TFTP_OPCODE_DATA:
+	case TFTP_OPCODE_ACK:
+		DEBUGP("Data/ACK opcode\n");
+		break;
+	case TFTP_OPCODE_ERROR:
+		DEBUGP("Error opcode\n");
+		break;
+	default:
+		DEBUGP("Unknown opcode\n");
+	}
+	return NF_ACCEPT;
+}
+
+static struct ip_conntrack_helper tftp[MAX_PORTS];
+static char tftp_names[MAX_PORTS][10];
+
+static void fini(void)
+{
+	int i;
+
+	for (i = 0 ; i < ports_c; i++) {
+		DEBUGP("unregistering helper for port %d\n",
+			ports[i]);
+		ip_conntrack_helper_unregister(&tftp[i]);
+	} 
+}
+
+static int __init init(void)
+{
+	int i, ret;
+	char *tmpname;
+
+	if (ports_c == 0)
+		ports[ports_c++] = TFTP_PORT;
+
+	for (i = 0; i < ports_c; i++) {
+		/* Create helper structure */
+		memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper));
+
+		tftp[i].tuple.dst.protonum = IPPROTO_UDP;
+		tftp[i].tuple.src.u.udp.port = htons(ports[i]);
+		tftp[i].mask.dst.protonum = 0xFF;
+		tftp[i].mask.src.u.udp.port = 0xFFFF;
+		tftp[i].max_expected = 1;
+		tftp[i].timeout = 5 * 60; /* 5 minutes */
+		tftp[i].me = THIS_MODULE;
+		tftp[i].help = tftp_help;
+
+		tmpname = &tftp_names[i][0];
+		if (ports[i] == TFTP_PORT)
+			sprintf(tmpname, "tftp");
+		else
+			sprintf(tmpname, "tftp-%d", i);
+		tftp[i].name = tmpname;
+
+		DEBUGP("port #%d: %d\n", i, ports[i]);
+
+		ret=ip_conntrack_helper_register(&tftp[i]);
+		if (ret) {
+			printk("ERROR registering helper for port %d\n",
+				ports[i]);
+			fini();
+			return(ret);
+		}
+	}
+	return(0);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c
new file mode 100644
index 000000000000..da1f412583ed
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_amanda.c
@@ -0,0 +1,88 @@
+/* Amanda extension for TCP NAT alteration.
+ * (C) 2002 by Brian J. Murrell <netfilter@interlinx.bc.ca>
+ * based on a copy of HW's ip_nat_irc.c as well as other modules
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ *
+ *	Module load syntax:
+ * 	insmod ip_nat_amanda.o
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_amanda.h>
+
+
+MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>");
+MODULE_DESCRIPTION("Amanda NAT helper");
+MODULE_LICENSE("GPL");
+
+static unsigned int help(struct sk_buff **pskb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct ip_conntrack_expect *exp)
+{
+	char buffer[sizeof("65535")];
+	u_int16_t port;
+	unsigned int ret;
+
+	/* Connection comes from client. */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_ORIGINAL;
+
+	/* When you see the packet, we need to NAT it the same as the
+	 * this one (ie. same IP: it will be TCP and master is UDP). */
+	exp->expectfn = ip_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		exp->tuple.dst.u.tcp.port = htons(port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (port == 0) {
+		ip_conntrack_expect_free(exp);
+		return NF_DROP;
+	}
+
+	sprintf(buffer, "%u", port);
+	ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo,
+				       matchoff, matchlen,
+				       buffer, strlen(buffer));
+	if (ret != NF_ACCEPT)
+		ip_conntrack_unexpect_related(exp);
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	ip_nat_amanda_hook = NULL;
+	/* Make sure noone calls it, meanwhile. */
+	synchronize_net();
+}
+
+static int __init init(void)
+{
+	BUG_ON(ip_nat_amanda_hook);
+	ip_nat_amanda_hook = help;
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
new file mode 100644
index 000000000000..162ceacfc29a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -0,0 +1,556 @@
+/* NAT for netfilter; shared with compatibility layer. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/vmalloc.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>  /* For tcp_prot in getorigdst */
+#include <linux/icmp.h>
+#include <linux/udp.h>
+#include <linux/jhash.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+DECLARE_RWLOCK(ip_nat_lock);
+
+/* Calculated at init based on memory size */
+static unsigned int ip_nat_htable_size;
+
+static struct list_head *bysource;
+struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO];
+
+
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct ip_conntrack_tuple *tuple)
+{
+	/* Original src, to ensure we map it consistently if poss. */
+	return jhash_3words(tuple->src.ip, tuple->src.u.all,
+			    tuple->dst.protonum, 0) % ip_nat_htable_size;
+}
+
+/* Noone using conntrack by the time this called. */
+static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn)
+{
+	if (!(conn->status & IPS_NAT_DONE_MASK))
+		return;
+
+	WRITE_LOCK(&ip_nat_lock);
+	list_del(&conn->nat.info.bysource);
+	WRITE_UNLOCK(&ip_nat_lock);
+}
+
+/* We do checksum mangling, so if they were wrong before they're still
+ * wrong.  Also works for incomplete packets (eg. ICMP dest
+ * unreachables.) */
+u_int16_t
+ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+	u_int32_t diffs[] = { oldvalinv, newval };
+	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+				      oldcheck^0xFFFF));
+}
+
+/* Is this tuple already taken? (not by us) */
+int
+ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
+		  const struct ip_conntrack *ignored_conntrack)
+{
+	/* Conntrack tracking doesn't keep track of outgoing tuples; only
+	   incoming ones.  NAT means they don't have a fixed mapping,
+	   so we invert the tuple and look for the incoming reply.
+
+	   We could keep a separate hash if this proves too slow. */
+	struct ip_conntrack_tuple reply;
+
+	invert_tuplepr(&reply, tuple);
+	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
+}
+
+/* If we source map this tuple so reply looks like reply_tuple, will
+ * that meet the constraints of range. */
+static int
+in_range(const struct ip_conntrack_tuple *tuple,
+	 const struct ip_nat_range *range)
+{
+	struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum);
+
+	/* If we are supposed to map IPs, then we must be in the
+	   range specified, otherwise let this drag us onto a new src IP. */
+	if (range->flags & IP_NAT_RANGE_MAP_IPS) {
+		if (ntohl(tuple->src.ip) < ntohl(range->min_ip)
+		    || ntohl(tuple->src.ip) > ntohl(range->max_ip))
+			return 0;
+	}
+
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+	    || proto->in_range(tuple, IP_NAT_MANIP_SRC,
+			       &range->min, &range->max))
+		return 1;
+
+	return 0;
+}
+
+static inline int
+same_src(const struct ip_conntrack *ct,
+	 const struct ip_conntrack_tuple *tuple)
+{
+	return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum
+		== tuple->dst.protonum
+		&& ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip
+		== tuple->src.ip
+		&& ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all
+		== tuple->src.u.all);
+}
+
+/* Only called for SRC manip */
+static int
+find_appropriate_src(const struct ip_conntrack_tuple *tuple,
+		     struct ip_conntrack_tuple *result,
+		     const struct ip_nat_range *range)
+{
+	unsigned int h = hash_by_src(tuple);
+	struct ip_conntrack *ct;
+
+	READ_LOCK(&ip_nat_lock);
+	list_for_each_entry(ct, &bysource[h], nat.info.bysource) {
+		if (same_src(ct, tuple)) {
+			/* Copy source part from reply tuple. */
+			invert_tuplepr(result,
+				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+			result->dst = tuple->dst;
+
+			if (in_range(result, range)) {
+				READ_UNLOCK(&ip_nat_lock);
+				return 1;
+			}
+		}
+	}
+	READ_UNLOCK(&ip_nat_lock);
+	return 0;
+}
+
+/* For [FUTURE] fragmentation handling, we want the least-used
+   src-ip/dst-ip/proto triple.  Fairness doesn't come into it.  Thus
+   if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports
+   1-65535, we don't do pro-rata allocation based on ports; we choose
+   the ip with the lowest src-ip/dst-ip/proto usage.
+*/
+static void
+find_best_ips_proto(struct ip_conntrack_tuple *tuple,
+		    const struct ip_nat_range *range,
+		    const struct ip_conntrack *conntrack,
+		    enum ip_nat_manip_type maniptype)
+{
+	u_int32_t *var_ipp;
+	/* Host order */
+	u_int32_t minip, maxip, j;
+
+	/* No IP mapping?  Do nothing. */
+	if (!(range->flags & IP_NAT_RANGE_MAP_IPS))
+		return;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		var_ipp = &tuple->src.ip;
+	else
+		var_ipp = &tuple->dst.ip;
+
+	/* Fast path: only one choice. */
+	if (range->min_ip == range->max_ip) {
+		*var_ipp = range->min_ip;
+		return;
+	}
+
+	/* Hashing source and destination IPs gives a fairly even
+	 * spread in practice (if there are a small number of IPs
+	 * involved, there usually aren't that many connections
+	 * anyway).  The consistency means that servers see the same
+	 * client coming from the same IP (some Internet Banking sites
+	 * like this), even across reboots. */
+	minip = ntohl(range->min_ip);
+	maxip = ntohl(range->max_ip);
+	j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0);
+	*var_ipp = htonl(minip + j % (maxip - minip + 1));
+}
+
+/* Manipulate the tuple into the range given.  For NF_IP_POST_ROUTING,
+ * we change the source to map into the range.  For NF_IP_PRE_ROUTING
+ * and NF_IP_LOCAL_OUT, we change the destination to map into the
+ * range.  It might not be possible to get a unique tuple, but we try.
+ * At worst (or if we race), we will end up with a final duplicate in
+ * __ip_conntrack_confirm and drop the packet. */
+static void
+get_unique_tuple(struct ip_conntrack_tuple *tuple,
+		 const struct ip_conntrack_tuple *orig_tuple,
+		 const struct ip_nat_range *range,
+		 struct ip_conntrack *conntrack,
+		 enum ip_nat_manip_type maniptype)
+{
+	struct ip_nat_protocol *proto
+		= ip_nat_find_proto(orig_tuple->dst.protonum);
+
+	/* 1) If this srcip/proto/src-proto-part is currently mapped,
+	   and that same mapping gives a unique tuple within the given
+	   range, use that.
+
+	   This is only required for source (ie. NAT/masq) mappings.
+	   So far, we don't do local source mappings, so multiple
+	   manips not an issue.  */
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		if (find_appropriate_src(orig_tuple, tuple, range)) {
+			DEBUGP("get_unique_tuple: Found current src map\n");
+			if (!ip_nat_used_tuple(tuple, conntrack))
+				return;
+		}
+	}
+
+	/* 2) Select the least-used IP/proto combination in the given
+	   range. */
+	*tuple = *orig_tuple;
+	find_best_ips_proto(tuple, range, conntrack, maniptype);
+
+	/* 3) The per-protocol part of the manip is made to map into
+	   the range to make a unique tuple. */
+
+	/* Only bother mapping if it's not already in range and unique */
+	if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)
+	     || proto->in_range(tuple, maniptype, &range->min, &range->max))
+	    && !ip_nat_used_tuple(tuple, conntrack))
+		return;
+
+	/* Last change: get protocol to try to obtain unique tuple. */
+	proto->unique_tuple(tuple, range, maniptype, conntrack);
+}
+
+unsigned int
+ip_nat_setup_info(struct ip_conntrack *conntrack,
+		  const struct ip_nat_range *range,
+		  unsigned int hooknum)
+{
+	struct ip_conntrack_tuple curr_tuple, new_tuple;
+	struct ip_nat_info *info = &conntrack->nat.info;
+	int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK);
+	enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+		     || hooknum == NF_IP_POST_ROUTING
+		     || hooknum == NF_IP_LOCAL_IN
+		     || hooknum == NF_IP_LOCAL_OUT);
+	BUG_ON(ip_nat_initialized(conntrack, maniptype));
+
+	/* What we've got will look like inverse of reply. Normally
+	   this is what is in the conntrack, except for prior
+	   manipulations (future optimization: if num_manips == 0,
+	   orig_tp =
+	   conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */
+	invert_tuplepr(&curr_tuple,
+		       &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple);
+
+	get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype);
+
+	if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) {
+		struct ip_conntrack_tuple reply;
+
+		/* Alter conntrack table so will recognize replies. */
+		invert_tuplepr(&reply, &new_tuple);
+		ip_conntrack_alter_reply(conntrack, &reply);
+
+		/* Non-atomic: we own this at the moment. */
+		if (maniptype == IP_NAT_MANIP_SRC)
+			conntrack->status |= IPS_SRC_NAT;
+		else
+			conntrack->status |= IPS_DST_NAT;
+	}
+
+	/* Place in source hash if this is the first time. */
+	if (have_to_hash) {
+		unsigned int srchash
+			= hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL]
+				      .tuple);
+		WRITE_LOCK(&ip_nat_lock);
+		list_add(&info->bysource, &bysource[srchash]);
+		WRITE_UNLOCK(&ip_nat_lock);
+	}
+
+	/* It's done. */
+	if (maniptype == IP_NAT_MANIP_DST)
+		set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status);
+	else
+		set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status);
+
+	return NF_ACCEPT;
+}
+
+/* Returns true if succeeded. */
+static int
+manip_pkt(u_int16_t proto,
+	  struct sk_buff **pskb,
+	  unsigned int iphdroff,
+	  const struct ip_conntrack_tuple *target,
+	  enum ip_nat_manip_type maniptype)
+{
+	struct iphdr *iph;
+
+	(*pskb)->nfcache |= NFC_ALTERED;
+	if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph)))
+		return 0;
+
+	iph = (void *)(*pskb)->data + iphdroff;
+
+	/* Manipulate protcol part. */
+	if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff,
+	                                         target, maniptype))
+		return 0;
+
+	iph = (void *)(*pskb)->data + iphdroff;
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip,
+						iph->check);
+		iph->saddr = target->src.ip;
+	} else {
+		iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip,
+						iph->check);
+		iph->daddr = target->dst.ip;
+	}
+	return 1;
+}
+
+/* Do packet manipulations according to ip_nat_setup_info. */
+unsigned int nat_packet(struct ip_conntrack *ct,
+			enum ip_conntrack_info ctinfo,
+			unsigned int hooknum,
+			struct sk_buff **pskb)
+{
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+	unsigned long statusbit;
+	enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum);
+
+	if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status)
+	    && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) {
+		DEBUGP("ip_nat_core: adjusting sequence number\n");
+		/* future: put this in a l4-proto specific function,
+		 * and call this function here. */
+		if (!ip_nat_seq_adjust(pskb, ct, ctinfo))
+			return NF_DROP;
+	}
+
+	if (mtype == IP_NAT_MANIP_SRC)
+		statusbit = IPS_SRC_NAT;
+	else
+		statusbit = IPS_DST_NAT;
+
+	/* Invert if this is reply dir. */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	/* Non-atomic: these bits don't change. */
+	if (ct->status & statusbit) {
+		struct ip_conntrack_tuple target;
+
+		/* We are aiming to look like inverse of other direction. */
+		invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+
+		if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype))
+			return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+/* Dir is direction ICMP is coming from (opposite to packet it contains) */
+int icmp_reply_translation(struct sk_buff **pskb,
+			   struct ip_conntrack *ct,
+			   enum ip_nat_manip_type manip,
+			   enum ip_conntrack_dir dir)
+{
+	struct {
+		struct icmphdr icmp;
+		struct iphdr ip;
+	} *inside;
+	struct ip_conntrack_tuple inner, target;
+	int hdrlen = (*pskb)->nh.iph->ihl * 4;
+
+	if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside)))
+		return 0;
+
+	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+
+	/* We're actually going to mangle it beyond trivial checksum
+	   adjustment, so make sure the current checksum is correct. */
+	if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) {
+		hdrlen = (*pskb)->nh.iph->ihl * 4;
+		if ((u16)csum_fold(skb_checksum(*pskb, hdrlen,
+						(*pskb)->len - hdrlen, 0)))
+			return 0;
+	}
+
+	/* Must be RELATED */
+	IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED ||
+		     (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY);
+
+	/* Redirects on non-null nats must be dropped, else they'll
+           start talking to each other without our translation, and be
+           confused... --RR */
+	if (inside->icmp.type == ICMP_REDIRECT) {
+		/* If NAT isn't finished, assume it and drop. */
+		if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK)
+			return 0;
+
+		if (ct->status & IPS_NAT_MASK)
+			return 0;
+	}
+
+	DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n",
+	       *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
+
+	if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 +
+	                     sizeof(struct icmphdr) + inside->ip.ihl*4,
+	                     &inner, ip_ct_find_proto(inside->ip.protocol)))
+		return 0;
+
+	/* Change inner back to look like incoming packet.  We do the
+	   opposite manip on this hook to normal, because it might not
+	   pass all hooks (locally-generated ICMP).  Consider incoming
+	   packet: PREROUTING (DST manip), routing produces ICMP, goes
+	   through POSTROUTING (which must correct the DST manip). */
+	if (!manip_pkt(inside->ip.protocol, pskb,
+		       (*pskb)->nh.iph->ihl*4
+		       + sizeof(inside->icmp),
+		       &ct->tuplehash[!dir].tuple,
+		       !manip))
+		return 0;
+
+	/* Reloading "inside" here since manip_pkt inner. */
+	inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+	inside->icmp.checksum = 0;
+	inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen,
+						       (*pskb)->len - hdrlen,
+						       0));
+
+	/* Change outer to look the reply to an incoming packet
+	 * (proto 0 means don't invert per-proto part). */
+
+	/* Obviously, we need to NAT destination IP, but source IP
+	   should be NAT'ed only if it is from a NAT'd host.
+
+	   Explanation: some people use NAT for anonymizing.  Also,
+	   CERT recommends dropping all packets from private IP
+	   addresses (although ICMP errors from internal links with
+	   such addresses are not too uncommon, as Alan Cox points
+	   out) */
+	if (manip != IP_NAT_MANIP_SRC
+	    || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) {
+		invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+		if (!manip_pkt(0, pskb, 0, &target, manip))
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Protocol registration. */
+int ip_nat_protocol_register(struct ip_nat_protocol *proto)
+{
+	int ret = 0;
+
+	WRITE_LOCK(&ip_nat_lock);
+	if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) {
+		ret = -EBUSY;
+		goto out;
+	}
+	ip_nat_protos[proto->protonum] = proto;
+ out:
+	WRITE_UNLOCK(&ip_nat_lock);
+	return ret;
+}
+
+/* Noone stores the protocol anywhere; simply delete it. */
+void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
+{
+	WRITE_LOCK(&ip_nat_lock);
+	ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol;
+	WRITE_UNLOCK(&ip_nat_lock);
+
+	/* Someone could be still looking at the proto in a bh. */
+	synchronize_net();
+}
+
+int __init ip_nat_init(void)
+{
+	size_t i;
+
+	/* Leave them the same for the moment. */
+	ip_nat_htable_size = ip_conntrack_htable_size;
+
+	/* One vmalloc for both hash tables */
+	bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size);
+	if (!bysource)
+		return -ENOMEM;
+
+	/* Sew in builtin protocols. */
+	WRITE_LOCK(&ip_nat_lock);
+	for (i = 0; i < MAX_IP_NAT_PROTO; i++)
+		ip_nat_protos[i] = &ip_nat_unknown_protocol;
+	ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp;
+	ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp;
+	ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp;
+	WRITE_UNLOCK(&ip_nat_lock);
+
+	for (i = 0; i < ip_nat_htable_size; i++) {
+		INIT_LIST_HEAD(&bysource[i]);
+	}
+
+	/* FIXME: Man, this is a hack.  <SIGH> */
+	IP_NF_ASSERT(ip_conntrack_destroyed == NULL);
+	ip_conntrack_destroyed = &ip_nat_cleanup_conntrack;
+
+	/* Initialize fake conntrack so that NAT will skip it */
+	ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK;
+	return 0;
+}
+
+/* Clear NAT section of all conntracks, in case we're loaded again. */
+static int clean_nat(struct ip_conntrack *i, void *data)
+{
+	memset(&i->nat, 0, sizeof(i->nat));
+	i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST);
+	return 0;
+}
+
+/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
+void ip_nat_cleanup(void)
+{
+	ip_ct_iterate_cleanup(&clean_nat, NULL);
+	ip_conntrack_destroyed = NULL;
+	vfree(bysource);
+}
diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c
new file mode 100644
index 000000000000..c6000e794ad6
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_ftp.c
@@ -0,0 +1,183 @@
+/* FTP extension for TCP NAT alteration. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/moduleparam.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_conntrack_ftp.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("ftp NAT helper");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* FIXME: Time out? --RR */
+
+static int
+mangle_rfc959_packet(struct sk_buff **pskb,
+		     u_int32_t newip,
+		     u_int16_t port,
+		     unsigned int matchoff,
+		     unsigned int matchlen,
+		     struct ip_conntrack *ct,
+		     enum ip_conntrack_info ctinfo,
+		     u32 *seq)
+{
+	char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
+
+	sprintf(buffer, "%u,%u,%u,%u,%u,%u",
+		NIPQUAD(newip), port>>8, port&0xFF);
+
+	DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+	*seq += strlen(buffer) - matchlen;
+	return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, 
+					matchlen, buffer, strlen(buffer));
+}
+
+/* |1|132.235.1.2|6275| */
+static int
+mangle_eprt_packet(struct sk_buff **pskb,
+		   u_int32_t newip,
+		   u_int16_t port,
+		   unsigned int matchoff,
+		   unsigned int matchlen,
+		   struct ip_conntrack *ct,
+		   enum ip_conntrack_info ctinfo,
+		   u32 *seq)
+{
+	char buffer[sizeof("|1|255.255.255.255|65535|")];
+
+	sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
+
+	DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+	*seq += strlen(buffer) - matchlen;
+	return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, 
+					matchlen, buffer, strlen(buffer));
+}
+
+/* |1|132.235.1.2|6275| */
+static int
+mangle_epsv_packet(struct sk_buff **pskb,
+		   u_int32_t newip,
+		   u_int16_t port,
+		   unsigned int matchoff,
+		   unsigned int matchlen,
+		   struct ip_conntrack *ct,
+		   enum ip_conntrack_info ctinfo,
+		   u32 *seq)
+{
+	char buffer[sizeof("|||65535|")];
+
+	sprintf(buffer, "|||%u|", port);
+
+	DEBUGP("calling ip_nat_mangle_tcp_packet\n");
+
+	*seq += strlen(buffer) - matchlen;
+	return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, 
+					matchlen, buffer, strlen(buffer));
+}
+
+static int (*mangle[])(struct sk_buff **, u_int32_t, u_int16_t,
+		     unsigned int,
+		     unsigned int,
+		     struct ip_conntrack *,
+		     enum ip_conntrack_info,
+		     u32 *seq)
+= { [IP_CT_FTP_PORT] = mangle_rfc959_packet,
+    [IP_CT_FTP_PASV] = mangle_rfc959_packet,
+    [IP_CT_FTP_EPRT] = mangle_eprt_packet,
+    [IP_CT_FTP_EPSV] = mangle_epsv_packet
+};
+
+/* So, this packet has hit the connection tracking matching code.
+   Mangle it, and change the expectation to match the new version. */
+static unsigned int ip_nat_ftp(struct sk_buff **pskb,
+			       enum ip_conntrack_info ctinfo,
+			       enum ip_ct_ftp_type type,
+			       unsigned int matchoff,
+			       unsigned int matchlen,
+			       struct ip_conntrack_expect *exp,
+			       u32 *seq)
+{
+	u_int32_t newip;
+	u_int16_t port;
+	int dir = CTINFO2DIR(ctinfo);
+	struct ip_conntrack *ct = exp->master;
+
+	DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
+
+	/* Connection will come from wherever this packet goes, hence !dir */
+	newip = ct->tuplehash[!dir].tuple.dst.ip;
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = !dir;
+
+	/* When you see the packet, we need to NAT it the same as the
+	 * this one. */
+	exp->expectfn = ip_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		exp->tuple.dst.u.tcp.port = htons(port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (port == 0) {
+		ip_conntrack_expect_free(exp);
+		return NF_DROP;
+	}
+
+	if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo,
+			  seq)) {
+		ip_conntrack_unexpect_related(exp);
+		return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+static void __exit fini(void)
+{
+	ip_nat_ftp_hook = NULL;
+	/* Make sure noone calls it, meanwhile. */
+	synchronize_net();
+}
+
+static int __init init(void)
+{
+	BUG_ON(ip_nat_ftp_hook);
+	ip_nat_ftp_hook = ip_nat_ftp;
+	return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+	printk(KERN_INFO __stringify(KBUILD_MODNAME)
+	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+	return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
new file mode 100644
index 000000000000..1637b96d8c01
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -0,0 +1,430 @@
+/* ip_nat_helper.c - generic support functions for NAT helpers 
+ *
+ * (C) 2000-2002 Harald Welte <laforge@netfilter.org>
+ * (C) 2003-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 	14 Jan 2002 Harald Welte <laforge@gnumonks.org>:
+ *		- add support for SACK adjustment 
+ *	14 Mar 2002 Harald Welte <laforge@gnumonks.org>:
+ *		- merge SACK support into newnat API
+ *	16 Aug 2002 Brian J. Murrell <netfilter@interlinx.bc.ca>:
+ *		- make ip_nat_resize_packet more generic (TCP and UDP)
+ *		- add ip_nat_mangle_udp_packet
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/checksum.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#define DUMP_OFFSET(x)	printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos);
+#else
+#define DEBUGP(format, args...)
+#define DUMP_OFFSET(x)
+#endif
+
+static DECLARE_LOCK(ip_nat_seqofs_lock);
+
+/* Setup TCP sequence correction given this change at this sequence */
+static inline void 
+adjust_tcp_sequence(u32 seq,
+		    int sizediff,
+		    struct ip_conntrack *ct, 
+		    enum ip_conntrack_info ctinfo)
+{
+	int dir;
+	struct ip_nat_seq *this_way, *other_way;
+
+	DEBUGP("ip_nat_resize_packet: old_size = %u, new_size = %u\n",
+		(*skb)->len, new_size);
+
+	dir = CTINFO2DIR(ctinfo);
+
+	this_way = &ct->nat.info.seq[dir];
+	other_way = &ct->nat.info.seq[!dir];
+
+	DEBUGP("ip_nat_resize_packet: Seq_offset before: ");
+	DUMP_OFFSET(this_way);
+
+	LOCK_BH(&ip_nat_seqofs_lock);
+
+	/* SYN adjust. If it's uninitialized, or this is after last
+	 * correction, record it: we don't handle more than one
+	 * adjustment in the window, but do deal with common case of a
+	 * retransmit */
+	if (this_way->offset_before == this_way->offset_after
+	    || before(this_way->correction_pos, seq)) {
+		    this_way->correction_pos = seq;
+		    this_way->offset_before = this_way->offset_after;
+		    this_way->offset_after += sizediff;
+	}
+	UNLOCK_BH(&ip_nat_seqofs_lock);
+
+	DEBUGP("ip_nat_resize_packet: Seq_offset after: ");
+	DUMP_OFFSET(this_way);
+}
+
+/* Frobs data inside this packet, which is linear. */
+static void mangle_contents(struct sk_buff *skb,
+			    unsigned int dataoff,
+			    unsigned int match_offset,
+			    unsigned int match_len,
+			    const char *rep_buffer,
+			    unsigned int rep_len)
+{
+	unsigned char *data;
+
+	BUG_ON(skb_is_nonlinear(skb));
+	data = (unsigned char *)skb->nh.iph + dataoff;
+
+	/* move post-replacement */
+	memmove(data + match_offset + rep_len,
+		data + match_offset + match_len,
+		skb->tail - (data + match_offset + match_len));
+
+	/* insert data from buffer */
+	memcpy(data + match_offset, rep_buffer, rep_len);
+
+	/* update skb info */
+	if (rep_len > match_len) {
+		DEBUGP("ip_nat_mangle_packet: Extending packet by "
+			"%u from %u bytes\n", rep_len - match_len,
+		       skb->len);
+		skb_put(skb, rep_len - match_len);
+	} else {
+		DEBUGP("ip_nat_mangle_packet: Shrinking packet from "
+			"%u from %u bytes\n", match_len - rep_len,
+		       skb->len);
+		__skb_trim(skb, skb->len + rep_len - match_len);
+	}
+
+	/* fix IP hdr checksum information */
+	skb->nh.iph->tot_len = htons(skb->len);
+	ip_send_check(skb->nh.iph);
+}
+
+/* Unusual, but possible case. */
+static int enlarge_skb(struct sk_buff **pskb, unsigned int extra)
+{
+	struct sk_buff *nskb;
+
+	if ((*pskb)->len + extra > 65535)
+		return 0;
+
+	nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC);
+	if (!nskb)
+		return 0;
+
+	/* Transfer socket to new skb. */
+	if ((*pskb)->sk)
+		skb_set_owner_w(nskb, (*pskb)->sk);
+#ifdef CONFIG_NETFILTER_DEBUG
+	nskb->nf_debug = (*pskb)->nf_debug;
+#endif
+	kfree_skb(*pskb);
+	*pskb = nskb;
+	return 1;
+}
+
+/* Generic function for mangling variable-length address changes inside
+ * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
+ * command in FTP).
+ *
+ * Takes care about all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * */
+int 
+ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
+			 struct ip_conntrack *ct,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int match_offset,
+			 unsigned int match_len,
+			 const char *rep_buffer,
+			 unsigned int rep_len)
+{
+	struct iphdr *iph;
+	struct tcphdr *tcph;
+	int datalen;
+
+	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+		return 0;
+
+	if (rep_len > match_len
+	    && rep_len - match_len > skb_tailroom(*pskb)
+	    && !enlarge_skb(pskb, rep_len - match_len))
+		return 0;
+
+	SKB_LINEAR_ASSERT(*pskb);
+
+	iph = (*pskb)->nh.iph;
+	tcph = (void *)iph + iph->ihl*4;
+
+	mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4,
+			match_offset, match_len, rep_buffer, rep_len);
+
+	datalen = (*pskb)->len - iph->ihl*4;
+	tcph->check = 0;
+	tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr,
+				   csum_partial((char *)tcph, datalen, 0));
+
+	if (rep_len != match_len) {
+		set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+		adjust_tcp_sequence(ntohl(tcph->seq),
+				    (int)rep_len - (int)match_len,
+				    ct, ctinfo);
+		/* Tell TCP window tracking about seq change */
+		ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo));
+	}
+	return 1;
+}
+			
+/* Generic function for mangling variable-length address changes inside
+ * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
+ * command in the Amanda protocol)
+ *
+ * Takes care about all the nasty sequence number changes, checksumming,
+ * skb enlargement, ...
+ *
+ * XXX - This function could be merged with ip_nat_mangle_tcp_packet which
+ *       should be fairly easy to do.
+ */
+int 
+ip_nat_mangle_udp_packet(struct sk_buff **pskb,
+			 struct ip_conntrack *ct,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int match_offset,
+			 unsigned int match_len,
+			 const char *rep_buffer,
+			 unsigned int rep_len)
+{
+	struct iphdr *iph;
+	struct udphdr *udph;
+
+	/* UDP helpers might accidentally mangle the wrong packet */
+	iph = (*pskb)->nh.iph;
+	if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + 
+	                       match_offset + match_len)
+		return 0;
+
+	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+		return 0;
+
+	if (rep_len > match_len
+	    && rep_len - match_len > skb_tailroom(*pskb)
+	    && !enlarge_skb(pskb, rep_len - match_len))
+		return 0;
+
+	iph = (*pskb)->nh.iph;
+	udph = (void *)iph + iph->ihl*4;
+	mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph),
+			match_offset, match_len, rep_buffer, rep_len);
+
+	/* update the length of the UDP packet */
+	udph->len = htons((*pskb)->len - iph->ihl*4);
+
+	/* fix udp checksum if udp checksum was previously calculated */
+	if (udph->check) {
+		int datalen = (*pskb)->len - iph->ihl * 4;
+		udph->check = 0;
+		udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+		                                datalen, IPPROTO_UDP,
+		                                csum_partial((char *)udph,
+		                                             datalen, 0));
+	}
+
+	return 1;
+}
+
+/* Adjust one found SACK option including checksum correction */
+static void
+sack_adjust(struct sk_buff *skb,
+	    struct tcphdr *tcph, 
+	    unsigned int sackoff,
+	    unsigned int sackend,
+	    struct ip_nat_seq *natseq)
+{
+	while (sackoff < sackend) {
+		struct tcp_sack_block *sack;
+		u_int32_t new_start_seq, new_end_seq;
+
+		sack = (void *)skb->data + sackoff;
+		if (after(ntohl(sack->start_seq) - natseq->offset_before,
+			  natseq->correction_pos))
+			new_start_seq = ntohl(sack->start_seq) 
+					- natseq->offset_after;
+		else
+			new_start_seq = ntohl(sack->start_seq) 
+					- natseq->offset_before;
+		new_start_seq = htonl(new_start_seq);
+
+		if (after(ntohl(sack->end_seq) - natseq->offset_before,
+			  natseq->correction_pos))
+			new_end_seq = ntohl(sack->end_seq)
+				      - natseq->offset_after;
+		else
+			new_end_seq = ntohl(sack->end_seq)
+				      - natseq->offset_before;
+		new_end_seq = htonl(new_end_seq);
+
+		DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n",
+			ntohl(sack->start_seq), new_start_seq,
+			ntohl(sack->end_seq), new_end_seq);
+
+		tcph->check = 
+			ip_nat_cheat_check(~sack->start_seq, new_start_seq,
+					   ip_nat_cheat_check(~sack->end_seq, 
+						   	      new_end_seq,
+							      tcph->check));
+		sack->start_seq = new_start_seq;
+		sack->end_seq = new_end_seq;
+		sackoff += sizeof(*sack);
+	}
+}
+
+/* TCP SACK sequence number adjustment */
+static inline unsigned int
+ip_nat_sack_adjust(struct sk_buff **pskb,
+		   struct tcphdr *tcph,
+		   struct ip_conntrack *ct,
+		   enum ip_conntrack_info ctinfo)
+{
+	unsigned int dir, optoff, optend;
+
+	optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr);
+	optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4;
+
+	if (!skb_ip_make_writable(pskb, optend))
+		return 0;
+
+	dir = CTINFO2DIR(ctinfo);
+
+	while (optoff < optend) {
+		/* Usually: option, length. */
+		unsigned char *op = (*pskb)->data + optoff;
+
+		switch (op[0]) {
+		case TCPOPT_EOL:
+			return 1;
+		case TCPOPT_NOP:
+			optoff++;
+			continue;
+		default:
+			/* no partial options */
+			if (optoff + 1 == optend
+			    || optoff + op[1] > optend
+			    || op[1] < 2)
+				return 0;
+			if (op[0] == TCPOPT_SACK
+			    && op[1] >= 2+TCPOLEN_SACK_PERBLOCK
+			    && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0)
+				sack_adjust(*pskb, tcph, optoff+2,
+					    optoff+op[1],
+					    &ct->nat.info.seq[!dir]);
+			optoff += op[1];
+		}
+	}
+	return 1;
+}
+
+/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
+int
+ip_nat_seq_adjust(struct sk_buff **pskb, 
+		  struct ip_conntrack *ct, 
+		  enum ip_conntrack_info ctinfo)
+{
+	struct tcphdr *tcph;
+	int dir, newseq, newack;
+	struct ip_nat_seq *this_way, *other_way;	
+
+	dir = CTINFO2DIR(ctinfo);
+
+	this_way = &ct->nat.info.seq[dir];
+	other_way = &ct->nat.info.seq[!dir];
+
+	if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+		return 0;
+
+	tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4;
+	if (after(ntohl(tcph->seq), this_way->correction_pos))
+		newseq = ntohl(tcph->seq) + this_way->offset_after;
+	else
+		newseq = ntohl(tcph->seq) + this_way->offset_before;
+	newseq = htonl(newseq);
+
+	if (after(ntohl(tcph->ack_seq) - other_way->offset_before,
+		  other_way->correction_pos))
+		newack = ntohl(tcph->ack_seq) - other_way->offset_after;
+	else
+		newack = ntohl(tcph->ack_seq) - other_way->offset_before;
+	newack = htonl(newack);
+
+	tcph->check = ip_nat_cheat_check(~tcph->seq, newseq,
+					 ip_nat_cheat_check(~tcph->ack_seq, 
+					 		    newack, 
+							    tcph->check));
+
+	DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n",
+		ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq),
+		ntohl(newack));
+
+	tcph->seq = newseq;
+	tcph->ack_seq = newack;
+
+	if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo))
+		return 0;
+
+	ip_conntrack_tcp_update(*pskb, ct, dir);
+
+	return 1;
+}
+
+/* Setup NAT on this expected conntrack so it follows master. */
+/* If we fail to get a free NAT slot, we'll get dropped on confirm */
+void ip_nat_follow_master(struct ip_conntrack *ct,
+			  struct ip_conntrack_expect *exp)
+{
+	struct ip_nat_range range;
+
+	/* This must be a fresh one. */
+	BUG_ON(ct->status & IPS_NAT_DONE_MASK);
+
+	/* Change src to where master sends to */
+	range.flags = IP_NAT_RANGE_MAP_IPS;
+	range.min_ip = range.max_ip
+		= ct->master->tuplehash[!exp->dir].tuple.dst.ip;
+	/* hook doesn't matter, but it has to do source manip */
+	ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
+
+	/* For DST manip, map port here to where it's expected. */
+	range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
+	range.min = range.max = exp->saved_proto;
+	range.min_ip = range.max_ip
+		= ct->master->tuplehash[!exp->dir].tuple.src.ip;
+	/* hook doesn't matter, but it has to do destination manip */
+	ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
+}
diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c
new file mode 100644
index 000000000000..9c1ca3381d56
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_irc.c
@@ -0,0 +1,125 @@
+/* IRC extension for TCP NAT alteration.
+ * (C) 2000-2001 by Harald Welte <laforge@gnumonks.org>
+ * (C) 2004 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation
+ * based on a copy of RR's ip_nat_ftp.c
+ *
+ * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_conntrack_irc.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/moduleparam.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("IRC (DCC) NAT helper");
+MODULE_LICENSE("GPL");
+
+static unsigned int help(struct sk_buff **pskb,
+			 enum ip_conntrack_info ctinfo,
+			 unsigned int matchoff,
+			 unsigned int matchlen,
+			 struct ip_conntrack_expect *exp)
+{
+	u_int16_t port;
+	unsigned int ret;
+
+	/* "4294967296 65635 " */
+	char buffer[18];
+
+	DEBUGP("IRC_NAT: info (seq %u + %u) in %u\n",
+	       expect->seq, exp_irc_info->len,
+	       ntohl(tcph->seq));
+
+	/* Reply comes from server. */
+	exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_REPLY;
+
+	/* When you see the packet, we need to NAT it the same as the
+	 * this one. */
+	exp->expectfn = ip_nat_follow_master;
+
+	/* Try to get same port: if not, try to change it. */
+	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		exp->tuple.dst.u.tcp.port = htons(port);
+		if (ip_conntrack_expect_related(exp) == 0)
+			break;
+	}
+
+	if (port == 0) {
+		ip_conntrack_expect_free(exp);
+		return NF_DROP;
+	}
+
+	/*      strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27
+	 *      strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28
+	 *      strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26
+	 *      strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26
+	 *      strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27
+	 *              AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits,
+	 *                      255.255.255.255==4294967296, 10 digits)
+	 *              P:         bound port (min 1 d, max 5d (65635))
+	 *              F:         filename   (min 1 d )
+	 *              S:         size       (min 1 d )
+	 *              0x01, \n:  terminators
+	 */
+
+	/* AAA = "us", ie. where server normally talks to. */
+	sprintf(buffer, "%u %u",
+		ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip),
+		port);
+	DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n",
+	       buffer, NIPQUAD(exp->tuple.src.ip), port);
+
+	ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo, 
+				       matchoff, matchlen, buffer, 
+				       strlen(buffer));
+	if (ret != NF_ACCEPT)
+		ip_conntrack_unexpect_related(exp);
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	ip_nat_irc_hook = NULL;
+	/* Make sure noone calls it, meanwhile. */
+	synchronize_net();
+}
+
+static int __init init(void)
+{
+	BUG_ON(ip_nat_irc_hook);
+	ip_nat_irc_hook = help;
+	return 0;
+}
+
+/* Prior to 2.6.11, we had a ports param.  No longer, but don't break users. */
+static int warn_set(const char *val, struct kernel_param *kp)
+{
+	printk(KERN_INFO __stringify(KBUILD_MODNAME)
+	       ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n");
+	return 0;
+}
+module_param_call(ports, warn_set, NULL, NULL, 0);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c
new file mode 100644
index 000000000000..a558cf0eee8a
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c
@@ -0,0 +1,115 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int
+icmp_in_range(const struct ip_conntrack_tuple *tuple,
+	      enum ip_nat_manip_type maniptype,
+	      const union ip_conntrack_manip_proto *min,
+	      const union ip_conntrack_manip_proto *max)
+{
+	return (tuple->src.u.icmp.id >= min->icmp.id
+		&& tuple->src.u.icmp.id <= max->icmp.id);
+}
+
+static int
+icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
+		  const struct ip_nat_range *range,
+		  enum ip_nat_manip_type maniptype,
+		  const struct ip_conntrack *conntrack)
+{
+	static u_int16_t id;
+	unsigned int range_size
+		= (unsigned int)range->max.icmp.id - range->min.icmp.id + 1;
+	unsigned int i;
+
+	/* If no range specified... */
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED))
+		range_size = 0xFFFF;
+
+	for (i = 0; i < range_size; i++, id++) {
+		tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size);
+		if (!ip_nat_used_tuple(tuple, conntrack))
+			return 1;
+	}
+	return 0;
+}
+
+static int
+icmp_manip_pkt(struct sk_buff **pskb,
+	       unsigned int iphdroff,
+	       const struct ip_conntrack_tuple *tuple,
+	       enum ip_nat_manip_type maniptype)
+{
+	struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+	struct icmphdr *hdr;
+	unsigned int hdroff = iphdroff + iph->ihl*4;
+
+	if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+		return 0;
+
+	hdr = (struct icmphdr *)((*pskb)->data + hdroff);
+
+	hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF,
+					    tuple->src.u.icmp.id,
+					    hdr->checksum);
+	hdr->un.echo.id = tuple->src.u.icmp.id;
+	return 1;
+}
+
+static unsigned int
+icmp_print(char *buffer,
+	   const struct ip_conntrack_tuple *match,
+	   const struct ip_conntrack_tuple *mask)
+{
+	unsigned int len = 0;
+
+	if (mask->src.u.icmp.id)
+		len += sprintf(buffer + len, "id=%u ",
+			       ntohs(match->src.u.icmp.id));
+
+	if (mask->dst.u.icmp.type)
+		len += sprintf(buffer + len, "type=%u ",
+			       ntohs(match->dst.u.icmp.type));
+
+	if (mask->dst.u.icmp.code)
+		len += sprintf(buffer + len, "code=%u ",
+			       ntohs(match->dst.u.icmp.code));
+
+	return len;
+}
+
+static unsigned int
+icmp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+	if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF)
+		return sprintf(buffer, "id %u-%u ",
+			       ntohs(range->min.icmp.id),
+			       ntohs(range->max.icmp.id));
+	else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_icmp
+= { "ICMP", IPPROTO_ICMP,
+    icmp_manip_pkt,
+    icmp_in_range,
+    icmp_unique_tuple,
+    icmp_print,
+    icmp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c
new file mode 100644
index 000000000000..a91cfceff272
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c
@@ -0,0 +1,178 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+
+static int
+tcp_in_range(const struct ip_conntrack_tuple *tuple,
+	     enum ip_nat_manip_type maniptype,
+	     const union ip_conntrack_manip_proto *min,
+	     const union ip_conntrack_manip_proto *max)
+{
+	u_int16_t port;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		port = tuple->src.u.tcp.port;
+	else
+		port = tuple->dst.u.tcp.port;
+
+	return ntohs(port) >= ntohs(min->tcp.port)
+		&& ntohs(port) <= ntohs(max->tcp.port);
+}
+
+static int
+tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
+		 const struct ip_nat_range *range,
+		 enum ip_nat_manip_type maniptype,
+		 const struct ip_conntrack *conntrack)
+{
+	static u_int16_t port, *portptr;
+	unsigned int range_size, min, i;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		portptr = &tuple->src.u.tcp.port;
+	else
+		portptr = &tuple->dst.u.tcp.port;
+
+	/* If no range specified... */
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+		/* If it's dst rewrite, can't change port */
+		if (maniptype == IP_NAT_MANIP_DST)
+			return 0;
+
+		/* Map privileged onto privileged. */
+		if (ntohs(*portptr) < 1024) {
+			/* Loose convention: >> 512 is credential passing */
+			if (ntohs(*portptr)<512) {
+				min = 1;
+				range_size = 511 - min + 1;
+			} else {
+				min = 600;
+				range_size = 1023 - min + 1;
+			}
+		} else {
+			min = 1024;
+			range_size = 65535 - 1024 + 1;
+		}
+	} else {
+		min = ntohs(range->min.tcp.port);
+		range_size = ntohs(range->max.tcp.port) - min + 1;
+	}
+
+	for (i = 0; i < range_size; i++, port++) {
+		*portptr = htons(min + port % range_size);
+		if (!ip_nat_used_tuple(tuple, conntrack)) {
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int
+tcp_manip_pkt(struct sk_buff **pskb,
+	      unsigned int iphdroff,
+	      const struct ip_conntrack_tuple *tuple,
+	      enum ip_nat_manip_type maniptype)
+{
+	struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+	struct tcphdr *hdr;
+	unsigned int hdroff = iphdroff + iph->ihl*4;
+	u32 oldip, newip;
+	u16 *portptr, newport, oldport;
+	int hdrsize = 8; /* TCP connection tracking guarantees this much */
+
+	/* this could be a inner header returned in icmp packet; in such
+	   cases we cannot update the checksum field since it is outside of
+	   the 8 bytes of transport layer headers we are guaranteed */
+	if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
+		hdrsize = sizeof(struct tcphdr);
+
+	if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
+		return 0;
+
+	iph = (struct iphdr *)((*pskb)->data + iphdroff);
+	hdr = (struct tcphdr *)((*pskb)->data + hdroff);
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		/* Get rid of src ip and src pt */
+		oldip = iph->saddr;
+		newip = tuple->src.ip;
+		newport = tuple->src.u.tcp.port;
+		portptr = &hdr->source;
+	} else {
+		/* Get rid of dst ip and dst pt */
+		oldip = iph->daddr;
+		newip = tuple->dst.ip;
+		newport = tuple->dst.u.tcp.port;
+		portptr = &hdr->dest;
+	}
+
+	oldport = *portptr;
+	*portptr = newport;
+
+	if (hdrsize < sizeof(*hdr))
+		return 1;
+
+	hdr->check = ip_nat_cheat_check(~oldip, newip,
+					ip_nat_cheat_check(oldport ^ 0xFFFF,
+							   newport,
+							   hdr->check));
+	return 1;
+}
+
+static unsigned int
+tcp_print(char *buffer,
+	  const struct ip_conntrack_tuple *match,
+	  const struct ip_conntrack_tuple *mask)
+{
+	unsigned int len = 0;
+
+	if (mask->src.u.tcp.port)
+		len += sprintf(buffer + len, "srcpt=%u ",
+			       ntohs(match->src.u.tcp.port));
+
+
+	if (mask->dst.u.tcp.port)
+		len += sprintf(buffer + len, "dstpt=%u ",
+			       ntohs(match->dst.u.tcp.port));
+
+	return len;
+}
+
+static unsigned int
+tcp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+	if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) {
+		if (range->min.tcp.port == range->max.tcp.port)
+			return sprintf(buffer, "port %u ",
+				       ntohs(range->min.tcp.port));
+		else
+			return sprintf(buffer, "ports %u-%u ",
+				       ntohs(range->min.tcp.port),
+				       ntohs(range->max.tcp.port));
+	}
+	else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_tcp
+= { "TCP", IPPROTO_TCP,
+    tcp_manip_pkt,
+    tcp_in_range,
+    tcp_unique_tuple,
+    tcp_print,
+    tcp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c
new file mode 100644
index 000000000000..c669e3b5f5d0
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_udp.c
@@ -0,0 +1,165 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int
+udp_in_range(const struct ip_conntrack_tuple *tuple,
+	     enum ip_nat_manip_type maniptype,
+	     const union ip_conntrack_manip_proto *min,
+	     const union ip_conntrack_manip_proto *max)
+{
+	u_int16_t port;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		port = tuple->src.u.udp.port;
+	else
+		port = tuple->dst.u.udp.port;
+
+	return ntohs(port) >= ntohs(min->udp.port)
+		&& ntohs(port) <= ntohs(max->udp.port);
+}
+
+static int
+udp_unique_tuple(struct ip_conntrack_tuple *tuple,
+		 const struct ip_nat_range *range,
+		 enum ip_nat_manip_type maniptype,
+		 const struct ip_conntrack *conntrack)
+{
+	static u_int16_t port, *portptr;
+	unsigned int range_size, min, i;
+
+	if (maniptype == IP_NAT_MANIP_SRC)
+		portptr = &tuple->src.u.udp.port;
+	else
+		portptr = &tuple->dst.u.udp.port;
+
+	/* If no range specified... */
+	if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+		/* If it's dst rewrite, can't change port */
+		if (maniptype == IP_NAT_MANIP_DST)
+			return 0;
+
+		if (ntohs(*portptr) < 1024) {
+			/* Loose convention: >> 512 is credential passing */
+			if (ntohs(*portptr)<512) {
+				min = 1;
+				range_size = 511 - min + 1;
+			} else {
+				min = 600;
+				range_size = 1023 - min + 1;
+			}
+		} else {
+			min = 1024;
+			range_size = 65535 - 1024 + 1;
+		}
+	} else {
+		min = ntohs(range->min.udp.port);
+		range_size = ntohs(range->max.udp.port) - min + 1;
+	}
+
+	for (i = 0; i < range_size; i++, port++) {
+		*portptr = htons(min + port % range_size);
+		if (!ip_nat_used_tuple(tuple, conntrack))
+			return 1;
+	}
+	return 0;
+}
+
+static int
+udp_manip_pkt(struct sk_buff **pskb,
+	      unsigned int iphdroff,
+	      const struct ip_conntrack_tuple *tuple,
+	      enum ip_nat_manip_type maniptype)
+{
+	struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+	struct udphdr *hdr;
+	unsigned int hdroff = iphdroff + iph->ihl*4;
+	u32 oldip, newip;
+	u16 *portptr, newport;
+
+	if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+		return 0;
+
+	iph = (struct iphdr *)((*pskb)->data + iphdroff);
+	hdr = (struct udphdr *)((*pskb)->data + hdroff);
+
+	if (maniptype == IP_NAT_MANIP_SRC) {
+		/* Get rid of src ip and src pt */
+		oldip = iph->saddr;
+		newip = tuple->src.ip;
+		newport = tuple->src.u.udp.port;
+		portptr = &hdr->source;
+	} else {
+		/* Get rid of dst ip and dst pt */
+		oldip = iph->daddr;
+		newip = tuple->dst.ip;
+		newport = tuple->dst.u.udp.port;
+		portptr = &hdr->dest;
+	}
+	if (hdr->check) /* 0 is a special case meaning no checksum */
+		hdr->check = ip_nat_cheat_check(~oldip, newip,
+					ip_nat_cheat_check(*portptr ^ 0xFFFF,
+							   newport,
+							   hdr->check));
+	*portptr = newport;
+	return 1;
+}
+
+static unsigned int
+udp_print(char *buffer,
+	  const struct ip_conntrack_tuple *match,
+	  const struct ip_conntrack_tuple *mask)
+{
+	unsigned int len = 0;
+
+	if (mask->src.u.udp.port)
+		len += sprintf(buffer + len, "srcpt=%u ",
+			       ntohs(match->src.u.udp.port));
+
+
+	if (mask->dst.u.udp.port)
+		len += sprintf(buffer + len, "dstpt=%u ",
+			       ntohs(match->dst.u.udp.port));
+
+	return len;
+}
+
+static unsigned int
+udp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+	if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) {
+		if (range->min.udp.port == range->max.udp.port)
+			return sprintf(buffer, "port %u ",
+				       ntohs(range->min.udp.port));
+		else
+			return sprintf(buffer, "ports %u-%u ",
+				       ntohs(range->min.udp.port),
+				       ntohs(range->max.udp.port));
+	}
+	else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_udp
+= { "UDP", IPPROTO_UDP,
+    udp_manip_pkt,
+    udp_in_range,
+    udp_unique_tuple,
+    udp_print,
+    udp_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c
new file mode 100644
index 000000000000..f5525bd58d16
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c
@@ -0,0 +1,70 @@
+/* The "unknown" protocol.  This is what is used for protocols we
+ * don't understand.  It's returned by ip_ct_find_proto().
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int unknown_in_range(const struct ip_conntrack_tuple *tuple,
+			    enum ip_nat_manip_type manip_type,
+			    const union ip_conntrack_manip_proto *min,
+			    const union ip_conntrack_manip_proto *max)
+{
+	return 1;
+}
+
+static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple,
+				const struct ip_nat_range *range,
+				enum ip_nat_manip_type maniptype,
+				const struct ip_conntrack *conntrack)
+{
+	/* Sorry: we can't help you; if it's not unique, we can't frob
+	   anything. */
+	return 0;
+}
+
+static int
+unknown_manip_pkt(struct sk_buff **pskb,
+		  unsigned int iphdroff,
+		  const struct ip_conntrack_tuple *tuple,
+		  enum ip_nat_manip_type maniptype)
+{
+	return 1;
+}
+
+static unsigned int
+unknown_print(char *buffer,
+	      const struct ip_conntrack_tuple *match,
+	      const struct ip_conntrack_tuple *mask)
+{
+	return 0;
+}
+
+static unsigned int
+unknown_print_range(char *buffer, const struct ip_nat_range *range)
+{
+	return 0;
+}
+
+struct ip_nat_protocol ip_nat_unknown_protocol = {
+	"unknown", 0,
+	unknown_manip_pkt,
+	unknown_in_range,
+	unknown_unique_tuple,
+	unknown_print,
+	unknown_print_range
+};
diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c
new file mode 100644
index 000000000000..581f097f5a24
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_rule.c
@@ -0,0 +1,319 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/* Everything about the rules for NAT. */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/checksum.h>
+#include <net/route.h>
+#include <linux/bitops.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define NAT_VALID_HOOKS ((1<<NF_IP_PRE_ROUTING) | (1<<NF_IP_POST_ROUTING) | (1<<NF_IP_LOCAL_OUT))
+
+static struct
+{
+	struct ipt_replace repl;
+	struct ipt_standard entries[3];
+	struct ipt_error term;
+} nat_initial_table __initdata
+= { { "nat", NAT_VALID_HOOKS, 4,
+      sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+      { [NF_IP_PRE_ROUTING] = 0,
+	[NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
+	[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
+      { [NF_IP_PRE_ROUTING] = 0,
+	[NF_IP_POST_ROUTING] = sizeof(struct ipt_standard),
+	[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
+      0, NULL, { } },
+    {
+	    /* PRE_ROUTING */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* POST_ROUTING */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* LOCAL_OUT */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } }
+    },
+    /* ERROR */
+    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+	0,
+	sizeof(struct ipt_entry),
+	sizeof(struct ipt_error),
+	0, { 0, 0 }, { } },
+      { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
+	  { } },
+	"ERROR"
+      }
+    }
+};
+
+static struct ipt_table nat_table = {
+	.name		= "nat",
+	.valid_hooks	= NAT_VALID_HOOKS,
+	.lock		= RW_LOCK_UNLOCKED,
+	.me		= THIS_MODULE,
+};
+
+/* Source NAT */
+static unsigned int ipt_snat_target(struct sk_buff **pskb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    unsigned int hooknum,
+				    const void *targinfo,
+				    void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+
+	IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	/* Connection must be valid and new. */
+	IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
+	                    || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+	IP_NF_ASSERT(out);
+
+	return ip_nat_setup_info(ct, &mr->range[0], hooknum);
+}
+
+/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
+static void warn_if_extra_mangle(u32 dstip, u32 srcip)
+{
+	static int warned = 0;
+	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
+	struct rtable *rt;
+
+	if (ip_route_output_key(&rt, &fl) != 0)
+		return;
+
+	if (rt->rt_src != srcip && !warned) {
+		printk("NAT: no longer support implicit source local NAT\n");
+		printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n",
+		       NIPQUAD(srcip), NIPQUAD(dstip));
+		warned = 1;
+	}
+	ip_rt_put(rt);
+}
+
+static unsigned int ipt_dnat_target(struct sk_buff **pskb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    unsigned int hooknum,
+				    const void *targinfo,
+				    void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+		     || hooknum == NF_IP_LOCAL_OUT);
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	/* Connection must be valid and new. */
+	IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+	if (hooknum == NF_IP_LOCAL_OUT
+	    && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
+		warn_if_extra_mangle((*pskb)->nh.iph->daddr,
+				     mr->range[0].min_ip);
+
+	return ip_nat_setup_info(ct, &mr->range[0], hooknum);
+}
+
+static int ipt_snat_checkentry(const char *tablename,
+			       const struct ipt_entry *e,
+			       void *targinfo,
+			       unsigned int targinfosize,
+			       unsigned int hook_mask)
+{
+	struct ip_nat_multi_range_compat *mr = targinfo;
+
+	/* Must be a valid range */
+	if (mr->rangesize != 1) {
+		printk("SNAT: multiple ranges no longer supported\n");
+		return 0;
+	}
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) {
+		DEBUGP("SNAT: Target size %u wrong for %u ranges\n",
+		       targinfosize, mr->rangesize);
+		return 0;
+	}
+
+	/* Only allow these for NAT. */
+	if (strcmp(tablename, "nat") != 0) {
+		DEBUGP("SNAT: wrong table %s\n", tablename);
+		return 0;
+	}
+
+	if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
+		DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask);
+		return 0;
+	}
+	return 1;
+}
+
+static int ipt_dnat_checkentry(const char *tablename,
+			       const struct ipt_entry *e,
+			       void *targinfo,
+			       unsigned int targinfosize,
+			       unsigned int hook_mask)
+{
+	struct ip_nat_multi_range_compat *mr = targinfo;
+
+	/* Must be a valid range */
+	if (mr->rangesize != 1) {
+		printk("DNAT: multiple ranges no longer supported\n");
+		return 0;
+	}
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) {
+		DEBUGP("DNAT: Target size %u wrong for %u ranges\n",
+		       targinfosize, mr->rangesize);
+		return 0;
+	}
+
+	/* Only allow these for NAT. */
+	if (strcmp(tablename, "nat") != 0) {
+		DEBUGP("DNAT: wrong table %s\n", tablename);
+		return 0;
+	}
+
+	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
+		DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask);
+		return 0;
+	}
+	
+	return 1;
+}
+
+inline unsigned int
+alloc_null_binding(struct ip_conntrack *conntrack,
+		   struct ip_nat_info *info,
+		   unsigned int hooknum)
+{
+	/* Force range to this IP; let proto decide mapping for
+	   per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+	   Use reply in case it's already been mangled (eg local packet).
+	*/
+	u_int32_t ip
+		= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
+		   ? conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip
+		   : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
+	struct ip_nat_range range
+		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
+
+	DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack,
+	       NIPQUAD(ip));
+	return ip_nat_setup_info(conntrack, &range, hooknum);
+}
+
+int ip_nat_rule_find(struct sk_buff **pskb,
+		     unsigned int hooknum,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     struct ip_conntrack *ct,
+		     struct ip_nat_info *info)
+{
+	int ret;
+
+	ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL);
+
+	if (ret == NF_ACCEPT) {
+		if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum)))
+			/* NUL mapping */
+			ret = alloc_null_binding(ct, info, hooknum);
+	}
+	return ret;
+}
+
+static struct ipt_target ipt_snat_reg = {
+	.name		= "SNAT",
+	.target		= ipt_snat_target,
+	.checkentry	= ipt_snat_checkentry,
+};
+
+static struct ipt_target ipt_dnat_reg = {
+	.name		= "DNAT",
+	.target		= ipt_dnat_target,
+	.checkentry	= ipt_dnat_checkentry,
+};
+
+int __init ip_nat_rule_init(void)
+{
+	int ret;
+
+	ret = ipt_register_table(&nat_table, &nat_initial_table.repl);
+	if (ret != 0)
+		return ret;
+	ret = ipt_register_target(&ipt_snat_reg);
+	if (ret != 0)
+		goto unregister_table;
+
+	ret = ipt_register_target(&ipt_dnat_reg);
+	if (ret != 0)
+		goto unregister_snat;
+
+	return ret;
+
+ unregister_snat:
+	ipt_unregister_target(&ipt_snat_reg);
+ unregister_table:
+	ipt_unregister_table(&nat_table);
+
+	return ret;
+}
+
+void ip_nat_rule_cleanup(void)
+{
+	ipt_unregister_target(&ipt_dnat_reg);
+	ipt_unregister_target(&ipt_snat_reg);
+	ipt_unregister_table(&nat_table);
+}
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c
new file mode 100644
index 000000000000..2a48b6e635ae
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c
@@ -0,0 +1,1347 @@
+/*
+ * ip_nat_snmp_basic.c
+ *
+ * Basic SNMP Application Layer Gateway
+ *
+ * This IP NAT module is intended for use with SNMP network 
+ * discovery and monitoring applications where target networks use 
+ * conflicting private address realms.
+ *
+ * Static NAT is used to remap the networks from the view of the network 
+ * management system at the IP layer, and this module remaps some application
+ * layer addresses to match.
+ *
+ * The simplest form of ALG is performed, where only tagged IP addresses
+ * are modified.  The module does not need to be MIB aware and only scans
+ * messages at the ASN.1/BER level.
+ *
+ * Currently, only SNMPv1 and SNMPv2 are supported.
+ *
+ * More information on ALG and associated issues can be found in
+ * RFC 2962
+ *
+ * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory 
+ * McLean & Jochen Friedrich, stripped down for use in the kernel.
+ *
+ * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
+ *
+ * Author: James Morris <jmorris@intercode.com.au>
+ *
+ * Updates:
+ * 2000-08-06: Convert to new helper API (Harald Welte).
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+#include <net/udp.h>
+#include <asm/uaccess.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway");
+
+#define SNMP_PORT 161
+#define SNMP_TRAP_PORT 162
+#define NOCT1(n) (u_int8_t )((n) & 0xff)
+
+static int debug;
+static DEFINE_SPINLOCK(snmp_lock);
+
+/* 
+ * Application layer address mapping mimics the NAT mapping, but 
+ * only for the first octet in this case (a more flexible system
+ * can be implemented if needed).
+ */
+struct oct1_map
+{
+	u_int8_t from;
+	u_int8_t to;
+};
+
+                                  
+/*****************************************************************************
+ *
+ * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* Class */
+#define ASN1_UNI	0	/* Universal */
+#define ASN1_APL	1	/* Application */
+#define ASN1_CTX	2	/* Context */
+#define ASN1_PRV	3	/* Private */
+
+/* Tag */
+#define ASN1_EOC	0	/* End Of Contents */
+#define ASN1_BOL	1	/* Boolean */
+#define ASN1_INT	2	/* Integer */
+#define ASN1_BTS	3	/* Bit String */
+#define ASN1_OTS	4	/* Octet String */
+#define ASN1_NUL	5	/* Null */
+#define ASN1_OJI	6	/* Object Identifier  */
+#define ASN1_OJD	7	/* Object Description */
+#define ASN1_EXT	8	/* External */
+#define ASN1_SEQ	16	/* Sequence */
+#define ASN1_SET	17	/* Set */
+#define ASN1_NUMSTR	18	/* Numerical String */
+#define ASN1_PRNSTR	19	/* Printable String */
+#define ASN1_TEXSTR	20	/* Teletext String */
+#define ASN1_VIDSTR	21	/* Video String */
+#define ASN1_IA5STR	22	/* IA5 String */
+#define ASN1_UNITIM	23	/* Universal Time */
+#define ASN1_GENTIM	24	/* General Time */
+#define ASN1_GRASTR	25	/* Graphical String */
+#define ASN1_VISSTR	26	/* Visible String */
+#define ASN1_GENSTR	27	/* General String */
+
+/* Primitive / Constructed methods*/
+#define ASN1_PRI	0	/* Primitive */
+#define ASN1_CON	1	/* Constructed */
+
+/*
+ * Error codes.
+ */
+#define ASN1_ERR_NOERROR		0
+#define ASN1_ERR_DEC_EMPTY		2
+#define ASN1_ERR_DEC_EOC_MISMATCH	3
+#define ASN1_ERR_DEC_LENGTH_MISMATCH	4
+#define ASN1_ERR_DEC_BADVALUE		5
+
+/* 
+ * ASN.1 context.
+ */
+struct asn1_ctx
+{
+	int error;			/* Error condition */
+	unsigned char *pointer;		/* Octet just to be decoded */
+	unsigned char *begin;		/* First octet */
+	unsigned char *end;		/* Octet after last octet */
+};
+
+/*
+ * Octet string (not null terminated)
+ */
+struct asn1_octstr
+{
+	unsigned char *data;
+	unsigned int len;
+};
+	
+static void asn1_open(struct asn1_ctx *ctx,
+                      unsigned char *buf,
+                      unsigned int len)
+{
+	ctx->begin = buf;
+	ctx->end = buf + len;
+	ctx->pointer = buf;
+	ctx->error = ASN1_ERR_NOERROR;
+}
+
+static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch)
+{
+	if (ctx->pointer >= ctx->end) {
+		ctx->error = ASN1_ERR_DEC_EMPTY;
+		return 0;
+	}
+	*ch = *(ctx->pointer)++;
+	return 1;
+}
+
+static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag)
+{
+	unsigned char ch;
+	
+	*tag = 0;
+	
+	do
+	{
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+		*tag <<= 7;
+		*tag |= ch & 0x7F;
+	} while ((ch & 0x80) == 0x80);
+	return 1;
+}
+
+static unsigned char asn1_id_decode(struct asn1_ctx *ctx, 
+                                    unsigned int *cls,
+                                    unsigned int *con,
+                                    unsigned int *tag)
+{
+	unsigned char ch;
+	
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+		
+	*cls = (ch & 0xC0) >> 6;
+	*con = (ch & 0x20) >> 5;
+	*tag = (ch & 0x1F);
+	
+	if (*tag == 0x1F) {
+		if (!asn1_tag_decode(ctx, tag))
+			return 0;
+	}
+	return 1;
+}
+
+static unsigned char asn1_length_decode(struct asn1_ctx *ctx,
+                                        unsigned int *def,
+                                        unsigned int *len)
+{
+	unsigned char ch, cnt;
+	
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+		
+	if (ch == 0x80)
+		*def = 0;
+	else {
+		*def = 1;
+		
+		if (ch < 0x80)
+			*len = ch;
+		else {
+			cnt = (unsigned char) (ch & 0x7F);
+			*len = 0;
+			
+			while (cnt > 0) {
+				if (!asn1_octet_decode(ctx, &ch))
+					return 0;
+				*len <<= 8;
+				*len |= ch;
+				cnt--;
+			}
+		}
+	}
+	return 1;
+}
+
+static unsigned char asn1_header_decode(struct asn1_ctx *ctx,
+                                        unsigned char **eoc,
+                                        unsigned int *cls,
+                                        unsigned int *con,
+                                        unsigned int *tag)
+{
+	unsigned int def, len;
+	
+	if (!asn1_id_decode(ctx, cls, con, tag))
+		return 0;
+		
+	if (!asn1_length_decode(ctx, &def, &len))
+		return 0;
+		
+	if (def)
+		*eoc = ctx->pointer + len;
+	else
+		*eoc = NULL;
+	return 1;
+}
+
+static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+	unsigned char ch;
+	
+	if (eoc == 0) {
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+			
+		if (ch != 0x00) {
+			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+			return 0;
+		}
+		
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+			
+		if (ch != 0x00) {
+			ctx->error = ASN1_ERR_DEC_EOC_MISMATCH;
+			return 0;
+		}
+		return 1;
+	} else {
+		if (ctx->pointer != eoc) {
+			ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH;
+			return 0;
+		}
+		return 1;
+	}
+}
+
+static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc)
+{
+	ctx->pointer = eoc;
+	return 1;
+}
+
+static unsigned char asn1_long_decode(struct asn1_ctx *ctx,
+                                      unsigned char *eoc,
+                                      long *integer)
+{
+	unsigned char ch;
+	unsigned int  len;
+	
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+		
+	*integer = (signed char) ch;
+	len = 1;
+	
+	while (ctx->pointer < eoc) {
+		if (++len > sizeof (long)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+		
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+			
+		*integer <<= 8;
+		*integer |= ch;
+	}
+	return 1;
+}
+
+static unsigned char asn1_uint_decode(struct asn1_ctx *ctx,
+                                      unsigned char *eoc,
+                                      unsigned int *integer)
+{
+	unsigned char ch;
+	unsigned int  len;
+	
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+		
+	*integer = ch;
+	if (ch == 0) len = 0;
+	else len = 1;
+	
+	while (ctx->pointer < eoc) {
+		if (++len > sizeof (unsigned int)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+		
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+			
+		*integer <<= 8;
+		*integer |= ch;
+	}
+	return 1;
+}
+
+static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx,
+                                       unsigned char *eoc,
+                                       unsigned long *integer)
+{
+	unsigned char ch;
+	unsigned int  len;
+	
+	if (!asn1_octet_decode(ctx, &ch))
+		return 0;
+		
+	*integer = ch;
+	if (ch == 0) len = 0;
+	else len = 1;
+	
+	while (ctx->pointer < eoc) {
+		if (++len > sizeof (unsigned long)) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			return 0;
+		}
+		
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+			
+		*integer <<= 8;
+		*integer |= ch;
+	}
+	return 1;
+}
+
+static unsigned char asn1_octets_decode(struct asn1_ctx *ctx,
+                                        unsigned char *eoc,
+                                        unsigned char **octets,
+                                        unsigned int *len)
+{
+	unsigned char *ptr;
+	
+	*len = 0;
+	
+	*octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC);
+	if (*octets == NULL) {
+		if (net_ratelimit())
+			printk("OOM in bsalg (%d)\n", __LINE__);
+		return 0;
+	}
+	
+	ptr = *octets;
+	while (ctx->pointer < eoc) {
+		if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) {
+			kfree(*octets);
+			*octets = NULL;
+			return 0;
+		}
+		(*len)++;
+	}
+	return 1;
+}
+
+static unsigned char asn1_subid_decode(struct asn1_ctx *ctx,
+                                       unsigned long *subid)
+{
+	unsigned char ch;
+	
+	*subid = 0;
+	
+	do {
+		if (!asn1_octet_decode(ctx, &ch))
+			return 0;
+		
+		*subid <<= 7;
+		*subid |= ch & 0x7F;
+	} while ((ch & 0x80) == 0x80);
+	return 1;
+}
+
+static unsigned char asn1_oid_decode(struct asn1_ctx *ctx,
+                                     unsigned char *eoc,
+                                     unsigned long **oid,
+                                     unsigned int *len)
+{
+	unsigned long subid;
+	unsigned int  size;
+	unsigned long *optr;
+	
+	size = eoc - ctx->pointer + 1;
+	*oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC);
+	if (*oid == NULL) {
+		if (net_ratelimit())
+			printk("OOM in bsalg (%d)\n", __LINE__);
+		return 0;
+	}
+	
+	optr = *oid;
+	
+	if (!asn1_subid_decode(ctx, &subid)) {
+		kfree(*oid);
+		*oid = NULL;
+		return 0;
+	}
+	
+	if (subid < 40) {
+		optr [0] = 0;
+		optr [1] = subid;
+	} else if (subid < 80) {
+		optr [0] = 1;
+		optr [1] = subid - 40;
+	} else {
+		optr [0] = 2;
+		optr [1] = subid - 80;
+	}
+	
+	*len = 2;
+	optr += 2;
+	
+	while (ctx->pointer < eoc) {
+		if (++(*len) > size) {
+			ctx->error = ASN1_ERR_DEC_BADVALUE;
+			kfree(*oid);
+			*oid = NULL;
+			return 0;
+		}
+		
+		if (!asn1_subid_decode(ctx, optr++)) {
+			kfree(*oid);
+			*oid = NULL;
+			return 0;
+		}
+	}
+	return 1;
+}
+
+/*****************************************************************************
+ *
+ * SNMP decoding routines (gxsnmp author Dirk Wisse)
+ *
+ *****************************************************************************/
+
+/* SNMP Versions */
+#define SNMP_V1				0
+#define SNMP_V2C			1
+#define SNMP_V2				2
+#define SNMP_V3				3
+
+/* Default Sizes */
+#define SNMP_SIZE_COMM			256
+#define SNMP_SIZE_OBJECTID		128
+#define SNMP_SIZE_BUFCHR		256
+#define SNMP_SIZE_BUFINT		128
+#define SNMP_SIZE_SMALLOBJECTID		16
+
+/* Requests */
+#define SNMP_PDU_GET			0
+#define SNMP_PDU_NEXT			1
+#define SNMP_PDU_RESPONSE		2
+#define SNMP_PDU_SET			3
+#define SNMP_PDU_TRAP1			4
+#define SNMP_PDU_BULK			5
+#define SNMP_PDU_INFORM			6
+#define SNMP_PDU_TRAP2			7
+
+/* Errors */
+#define SNMP_NOERROR			0
+#define SNMP_TOOBIG			1
+#define SNMP_NOSUCHNAME			2
+#define SNMP_BADVALUE			3
+#define SNMP_READONLY			4
+#define SNMP_GENERROR			5
+#define SNMP_NOACCESS			6
+#define SNMP_WRONGTYPE			7
+#define SNMP_WRONGLENGTH		8
+#define SNMP_WRONGENCODING		9
+#define SNMP_WRONGVALUE			10
+#define SNMP_NOCREATION			11
+#define SNMP_INCONSISTENTVALUE		12
+#define SNMP_RESOURCEUNAVAILABLE	13
+#define SNMP_COMMITFAILED		14
+#define SNMP_UNDOFAILED			15
+#define SNMP_AUTHORIZATIONERROR		16
+#define SNMP_NOTWRITABLE		17
+#define SNMP_INCONSISTENTNAME		18
+
+/* General SNMP V1 Traps */
+#define SNMP_TRAP_COLDSTART		0
+#define SNMP_TRAP_WARMSTART		1
+#define SNMP_TRAP_LINKDOWN		2
+#define SNMP_TRAP_LINKUP		3
+#define SNMP_TRAP_AUTFAILURE		4
+#define SNMP_TRAP_EQPNEIGHBORLOSS	5
+#define SNMP_TRAP_ENTSPECIFIC		6
+
+/* SNMPv1 Types */
+#define SNMP_NULL                0
+#define SNMP_INTEGER             1    /* l  */
+#define SNMP_OCTETSTR            2    /* c  */
+#define SNMP_DISPLAYSTR          2    /* c  */
+#define SNMP_OBJECTID            3    /* ul */
+#define SNMP_IPADDR              4    /* uc */
+#define SNMP_COUNTER             5    /* ul */
+#define SNMP_GAUGE               6    /* ul */
+#define SNMP_TIMETICKS           7    /* ul */
+#define SNMP_OPAQUE              8    /* c  */
+
+/* Additional SNMPv2 Types */
+#define SNMP_UINTEGER            5    /* ul */
+#define SNMP_BITSTR              9    /* uc */
+#define SNMP_NSAP               10    /* uc */
+#define SNMP_COUNTER64          11    /* ul */
+#define SNMP_NOSUCHOBJECT       12
+#define SNMP_NOSUCHINSTANCE     13
+#define SNMP_ENDOFMIBVIEW       14
+
+union snmp_syntax
+{
+	unsigned char uc[0];	/* 8 bit unsigned */
+	char c[0];		/* 8 bit signed */
+	unsigned long ul[0];	/* 32 bit unsigned */
+	long l[0];		/* 32 bit signed */
+};
+
+struct snmp_object
+{
+	unsigned long *id;
+	unsigned int id_len;
+	unsigned short type;
+	unsigned int syntax_len;
+	union snmp_syntax syntax;
+};
+
+struct snmp_request
+{
+	unsigned long id;
+	unsigned int error_status;
+	unsigned int error_index;
+};
+
+struct snmp_v1_trap
+{
+	unsigned long *id;
+	unsigned int id_len;
+	unsigned long ip_address;	/* pointer  */
+	unsigned int general;
+	unsigned int specific;
+	unsigned long time;
+};
+
+/* SNMP types */
+#define SNMP_IPA    0
+#define SNMP_CNT    1
+#define SNMP_GGE    2
+#define SNMP_TIT    3
+#define SNMP_OPQ    4
+#define SNMP_C64    6
+
+/* SNMP errors */
+#define SERR_NSO    0
+#define SERR_NSI    1
+#define SERR_EOM    2
+
+static inline void mangle_address(unsigned char *begin,
+                                  unsigned char *addr,
+                                  const struct oct1_map *map,
+                                  u_int16_t *check);
+struct snmp_cnv
+{
+	unsigned int class;
+	unsigned int tag;
+	int syntax;
+};
+
+static struct snmp_cnv snmp_conv [] =
+{
+	{ASN1_UNI, ASN1_NUL, SNMP_NULL},
+	{ASN1_UNI, ASN1_INT, SNMP_INTEGER},
+	{ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR},
+	{ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR},
+	{ASN1_UNI, ASN1_OJI, SNMP_OBJECTID},
+	{ASN1_APL, SNMP_IPA, SNMP_IPADDR},
+	{ASN1_APL, SNMP_CNT, SNMP_COUNTER},	/* Counter32 */
+	{ASN1_APL, SNMP_GGE, SNMP_GAUGE},	/* Gauge32 == Unsigned32  */
+	{ASN1_APL, SNMP_TIT, SNMP_TIMETICKS},
+	{ASN1_APL, SNMP_OPQ, SNMP_OPAQUE},
+	
+	/* SNMPv2 data types and errors */
+	{ASN1_UNI, ASN1_BTS, SNMP_BITSTR},
+	{ASN1_APL, SNMP_C64, SNMP_COUNTER64},
+	{ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT},
+	{ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE},
+	{ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW},
+	{0,       0,       -1}
+};
+
+static unsigned char snmp_tag_cls2syntax(unsigned int tag,
+                                         unsigned int cls,
+                                         unsigned short *syntax)
+{
+	struct snmp_cnv *cnv;
+	
+	cnv = snmp_conv;
+	
+	while (cnv->syntax != -1) {
+		if (cnv->tag == tag && cnv->class == cls) {
+			*syntax = cnv->syntax;
+			return 1;
+		}
+		cnv++;
+	}
+	return 0;
+}
+
+static unsigned char snmp_object_decode(struct asn1_ctx *ctx,
+                                        struct snmp_object **obj)
+{
+	unsigned int cls, con, tag, len, idlen;
+	unsigned short type;
+	unsigned char *eoc, *end, *p;
+	unsigned long *lp, *id;
+	unsigned long ul;
+	long  l;
+	
+	*obj = NULL;
+	id = NULL;
+	
+	if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag))
+		return 0;
+		
+	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+		return 0;
+	
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+	
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+		return 0;
+	
+	if (!asn1_oid_decode(ctx, end, &id, &idlen))
+		return 0;
+		
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) {
+		kfree(id);
+		return 0;
+	}
+	
+	if (con != ASN1_PRI) {
+		kfree(id);
+		return 0;
+	}
+	
+	if (!snmp_tag_cls2syntax(tag, cls, &type)) {
+		kfree(id);
+		return 0;
+	}
+	
+	switch (type) {
+		case SNMP_INTEGER:
+			len = sizeof(long);
+			if (!asn1_long_decode(ctx, end, &l)) {
+				kfree(id);
+				return 0;
+			}
+			*obj = kmalloc(sizeof(struct snmp_object) + len,
+			               GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(id);
+				if (net_ratelimit())
+					printk("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			(*obj)->syntax.l[0] = l;
+			break;
+		case SNMP_OCTETSTR:
+		case SNMP_OPAQUE:
+			if (!asn1_octets_decode(ctx, end, &p, &len)) {
+				kfree(id);
+				return 0;
+			}
+			*obj = kmalloc(sizeof(struct snmp_object) + len,
+			               GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(id);
+				if (net_ratelimit())
+					printk("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			memcpy((*obj)->syntax.c, p, len);
+			kfree(p);
+			break;
+		case SNMP_NULL:
+		case SNMP_NOSUCHOBJECT:
+		case SNMP_NOSUCHINSTANCE:
+		case SNMP_ENDOFMIBVIEW:
+			len = 0;
+			*obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(id);
+				if (net_ratelimit())
+					printk("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			if (!asn1_null_decode(ctx, end)) {
+				kfree(id);
+				kfree(*obj);
+				*obj = NULL;
+				return 0;
+			}
+			break;
+		case SNMP_OBJECTID:
+			if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) {
+				kfree(id);
+				return 0;
+			}
+			len *= sizeof(unsigned long);
+			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(id);
+				if (net_ratelimit())
+					printk("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			memcpy((*obj)->syntax.ul, lp, len);
+			kfree(lp);
+			break;
+		case SNMP_IPADDR:
+			if (!asn1_octets_decode(ctx, end, &p, &len)) {
+				kfree(id);
+				return 0;
+			}
+			if (len != 4) {
+				kfree(p);
+				kfree(id);
+				return 0;
+			}
+			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(p);
+				kfree(id);
+				if (net_ratelimit())
+					printk("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			memcpy((*obj)->syntax.uc, p, len);
+			kfree(p);
+			break;
+		case SNMP_COUNTER:
+		case SNMP_GAUGE:
+		case SNMP_TIMETICKS:
+			len = sizeof(unsigned long);
+			if (!asn1_ulong_decode(ctx, end, &ul)) {
+				kfree(id);
+				return 0;
+			}
+			*obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC);
+			if (*obj == NULL) {
+				kfree(id);
+				if (net_ratelimit())
+					printk("OOM in bsalg (%d)\n", __LINE__);
+				return 0;
+			}
+			(*obj)->syntax.ul[0] = ul;
+			break;
+		default:
+			kfree(id);
+			return 0;
+	}
+	
+	(*obj)->syntax_len = len;
+	(*obj)->type = type;
+	(*obj)->id = id;
+	(*obj)->id_len = idlen;
+	
+	if (!asn1_eoc_decode(ctx, eoc)) {
+		kfree(id);
+		kfree(*obj);
+		*obj = NULL;
+		return 0;
+	}
+	return 1;
+}
+
+static unsigned char snmp_request_decode(struct asn1_ctx *ctx,
+                                         struct snmp_request *request)
+{
+	unsigned int cls, con, tag;
+	unsigned char *end;
+	
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+		
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		return 0;
+		
+	if (!asn1_ulong_decode(ctx, end, &request->id))
+		return 0;
+		
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+		
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		return 0;
+		
+	if (!asn1_uint_decode(ctx, end, &request->error_status))
+		return 0;
+		
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+		
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		return 0;
+		
+	if (!asn1_uint_decode(ctx, end, &request->error_index))
+		return 0;
+	
+	return 1;
+}
+
+/* 
+ * Fast checksum update for possibly oddly-aligned UDP byte, from the
+ * code example in the draft.
+ */
+static void fast_csum(unsigned char *csum,
+                      const unsigned char *optr,
+                      const unsigned char *nptr,
+                      int odd)
+{
+	long x, old, new;
+	
+	x = csum[0] * 256 + csum[1];
+	
+	x =~ x & 0xFFFF;
+	
+	if (odd) old = optr[0] * 256;
+	else old = optr[0];
+	
+	x -= old & 0xFFFF;
+	if (x <= 0) {
+		x--;
+		x &= 0xFFFF;
+	}
+	
+	if (odd) new = nptr[0] * 256;
+	else new = nptr[0];
+	
+	x += new & 0xFFFF;
+	if (x & 0x10000) {
+		x++;
+		x &= 0xFFFF;
+	}
+	
+	x =~ x & 0xFFFF;
+	csum[0] = x / 256;
+	csum[1] = x & 0xFF;
+}
+
+/* 
+ * Mangle IP address.
+ * 	- begin points to the start of the snmp messgae
+ *      - addr points to the start of the address
+ */
+static inline void mangle_address(unsigned char *begin,
+                                  unsigned char *addr,
+                                  const struct oct1_map *map,
+                                  u_int16_t *check)
+{
+	if (map->from == NOCT1(*addr)) {
+		u_int32_t old;
+		
+		if (debug)
+			memcpy(&old, (unsigned char *)addr, sizeof(old));
+			
+		*addr = map->to;
+		
+		/* Update UDP checksum if being used */
+		if (*check) {
+			unsigned char odd = !((addr - begin) % 2);
+			
+			fast_csum((unsigned char *)check,
+			          &map->from, &map->to, odd);
+			          
+		}
+		
+		if (debug)
+			printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to "
+			       "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr));
+	}
+}
+
+static unsigned char snmp_trap_decode(struct asn1_ctx *ctx,
+                                      struct snmp_v1_trap *trap,
+                                      const struct oct1_map *map,
+                                      u_int16_t *check)
+{
+	unsigned int cls, con, tag, len;
+	unsigned char *end;
+
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		return 0;
+		
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI)
+		return 0;
+	
+	if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len))
+		return 0;
+		
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		goto err_id_free;
+
+	if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) ||
+	      (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS)))
+		goto err_id_free;
+	
+	if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len))
+		goto err_id_free;
+	
+	/* IPv4 only */
+	if (len != 4)
+		goto err_addr_free;
+	
+	mangle_address(ctx->begin, ctx->pointer - 4, map, check);
+	
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		goto err_addr_free;
+		
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		goto err_addr_free;
+		
+	if (!asn1_uint_decode(ctx, end, &trap->general))
+		goto err_addr_free;
+		
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		goto err_addr_free;
+	
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		goto err_addr_free;
+		
+	if (!asn1_uint_decode(ctx, end, &trap->specific))
+		goto err_addr_free;
+		
+	if (!asn1_header_decode(ctx, &end, &cls, &con, &tag))
+		goto err_addr_free;
+		
+	if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) ||
+	      (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT)))
+		goto err_addr_free;
+		
+	if (!asn1_ulong_decode(ctx, end, &trap->time))
+		goto err_addr_free;
+		
+	return 1;
+
+err_id_free:
+	kfree(trap->id);
+
+err_addr_free:
+	kfree((unsigned long *)trap->ip_address);
+	
+	return 0;
+}
+
+/*****************************************************************************
+ *
+ * Misc. routines
+ *
+ *****************************************************************************/
+
+static void hex_dump(unsigned char *buf, size_t len)
+{
+	size_t i;
+	
+	for (i = 0; i < len; i++) {
+		if (i && !(i % 16))
+			printk("\n");
+		printk("%02x ", *(buf + i));
+	}
+	printk("\n");
+}
+
+/*
+ * Parse and mangle SNMP message according to mapping.
+ * (And this is the fucking 'basic' method).
+ */
+static int snmp_parse_mangle(unsigned char *msg,
+                             u_int16_t len,
+                             const struct oct1_map *map,
+                             u_int16_t *check)
+{
+	unsigned char *eoc, *end;
+	unsigned int cls, con, tag, vers, pdutype;
+	struct asn1_ctx ctx;
+	struct asn1_octstr comm;
+	struct snmp_object **obj;
+	
+	if (debug > 1)
+		hex_dump(msg, len);
+
+	asn1_open(&ctx, msg, len);
+	
+	/* 
+	 * Start of SNMP message.
+	 */
+	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+		return 0;
+	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+		return 0;
+	
+	/* 
+	 * Version 1 or 2 handled.
+	 */
+	if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag))
+		return 0;
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT)
+		return 0;
+	if (!asn1_uint_decode (&ctx, end, &vers))
+		return 0;
+	if (debug > 1)
+		printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
+	if (vers > 1)
+		return 1;
+	
+	/*
+	 * Community.
+	 */
+	if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag))
+		return 0;
+	if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS)
+		return 0;
+	if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len))
+		return 0;
+	if (debug > 1) {
+		unsigned int i;
+		
+		printk(KERN_DEBUG "bsalg: community: ");
+		for (i = 0; i < comm.len; i++)
+			printk("%c", comm.data[i]);
+		printk("\n");
+	}
+	kfree(comm.data);
+	
+	/*
+	 * PDU type
+	 */
+	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype))
+		return 0;
+	if (cls != ASN1_CTX || con != ASN1_CON)
+		return 0;
+	if (debug > 1) {
+		unsigned char *pdus[] = {
+			[SNMP_PDU_GET] = "get",
+			[SNMP_PDU_NEXT] = "get-next",
+			[SNMP_PDU_RESPONSE] = "response",
+			[SNMP_PDU_SET] = "set",
+			[SNMP_PDU_TRAP1] = "trapv1",
+			[SNMP_PDU_BULK] = "bulk",
+			[SNMP_PDU_INFORM] = "inform",
+			[SNMP_PDU_TRAP2] = "trapv2"
+		};
+		
+		if (pdutype > SNMP_PDU_TRAP2)
+			printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
+		else
+			printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
+	}
+	if (pdutype != SNMP_PDU_RESPONSE &&
+	    pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
+		return 1;
+	
+	/*
+	 * Request header or v1 trap
+	 */
+	if (pdutype == SNMP_PDU_TRAP1) {
+		struct snmp_v1_trap trap;
+		unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check);
+		
+		/* Discard trap allocations regardless */
+		kfree(trap.id);
+		kfree((unsigned long *)trap.ip_address);
+		
+		if (!ret)
+			return ret;
+		
+	} else {
+		struct snmp_request req;
+		
+		if (!snmp_request_decode(&ctx, &req))
+			return 0;
+			
+		if (debug > 1)
+			printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
+			"error_index=%u\n", req.id, req.error_status,
+			req.error_index);
+	}
+	
+	/*
+	 * Loop through objects, look for IP addresses to mangle.
+	 */
+	if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag))
+		return 0;
+		
+	if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
+		return 0;
+	
+	obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
+	if (obj == NULL) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
+		return 0;	
+	}
+
+	while (!asn1_eoc_decode(&ctx, eoc)) {
+		unsigned int i;
+		
+		if (!snmp_object_decode(&ctx, obj)) {
+			if (*obj) {
+				if ((*obj)->id)
+					kfree((*obj)->id);
+				kfree(*obj);
+			}	
+			kfree(obj);
+			return 0;
+		}
+
+		if (debug > 1) {
+			printk(KERN_DEBUG "bsalg: object: ");
+			for (i = 0; i < (*obj)->id_len; i++) {
+				if (i > 0)
+					printk(".");
+				printk("%lu", (*obj)->id[i]);
+			}
+			printk(": type=%u\n", (*obj)->type);
+			
+		}
+
+		if ((*obj)->type == SNMP_IPADDR)
+			mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
+		
+		kfree((*obj)->id);
+		kfree(*obj);
+	}
+	kfree(obj);
+	
+	if (!asn1_eoc_decode(&ctx, eoc))
+		return 0;
+		
+	return 1;
+}
+
+/*****************************************************************************
+ *
+ * NAT routines.
+ *
+ *****************************************************************************/
+
+/* 
+ * SNMP translation routine.
+ */
+static int snmp_translate(struct ip_conntrack *ct,
+                          enum ip_conntrack_info ctinfo,
+                          struct sk_buff **pskb)
+{
+	struct iphdr *iph = (*pskb)->nh.iph;
+	struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
+	u_int16_t udplen = ntohs(udph->len);
+	u_int16_t paylen = udplen - sizeof(struct udphdr);
+	int dir = CTINFO2DIR(ctinfo);
+	struct oct1_map map;
+
+	/*
+	 * Determine mappping for application layer addresses based
+	 * on NAT manipulations for the packet.
+	 */
+	if (dir == IP_CT_DIR_ORIGINAL) {
+		/* SNAT traps */
+		map.from = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip);
+		map.to = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip);
+	} else {
+		/* DNAT replies */
+		map.from = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip);
+		map.to = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip);
+	}
+	
+	if (map.from == map.to)
+		return NF_ACCEPT;
+	
+	if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr),
+	                       paylen, &map, &udph->check)) {
+		if (net_ratelimit())
+			printk(KERN_WARNING "bsalg: parser failed\n");
+		return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+/* We don't actually set up expectations, just adjust internal IP
+ * addresses if this is being NATted */
+static int help(struct sk_buff **pskb,
+		struct ip_conntrack *ct,
+		enum ip_conntrack_info ctinfo)
+{
+	int dir = CTINFO2DIR(ctinfo);
+	unsigned int ret;
+	struct iphdr *iph = (*pskb)->nh.iph;
+	struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl);
+
+	/* SNMP replies and originating SNMP traps get mangled */
+	if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY)
+		return NF_ACCEPT;
+	if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL)
+		return NF_ACCEPT;
+
+	/* No NAT? */
+	if (!(ct->status & IPS_NAT_MASK))
+		return NF_ACCEPT;
+
+	/* 
+	 * Make sure the packet length is ok.  So far, we were only guaranteed
+	 * to have a valid length IP header plus 8 bytes, which means we have
+	 * enough room for a UDP header.  Just verify the UDP length field so we
+	 * can mess around with the payload.
+	 */
+	if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) {
+		 if (net_ratelimit())
+			 printk(KERN_WARNING "SNMP: dropping malformed packet "
+				"src=%u.%u.%u.%u dst=%u.%u.%u.%u\n",
+				NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
+		 return NF_DROP;
+	}
+
+	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+		return NF_DROP;
+
+	spin_lock_bh(&snmp_lock);
+	ret = snmp_translate(ct, ctinfo, pskb);
+	spin_unlock_bh(&snmp_lock);
+	return ret;
+}
+
+static struct ip_conntrack_helper snmp_helper = {
+	.max_expected = 0,
+	.timeout = 180,
+	.me = THIS_MODULE,
+	.help = help,
+	.name = "snmp",
+
+	.tuple = { .src = { .u = { __constant_htons(SNMP_PORT) } },
+		   .dst = { .protonum = IPPROTO_UDP },
+	},
+	.mask = { .src = { .u = { 0xFFFF } },
+		 .dst = { .protonum = 0xFF },
+	},
+};
+
+static struct ip_conntrack_helper snmp_trap_helper = {
+	.max_expected = 0,
+	.timeout = 180,
+	.me = THIS_MODULE,
+	.help = help,
+	.name = "snmp_trap",
+
+	.tuple = { .src = { .u = { __constant_htons(SNMP_TRAP_PORT) } },
+		   .dst = { .protonum = IPPROTO_UDP },
+	},
+	.mask = { .src = { .u = { 0xFFFF } },
+		 .dst = { .protonum = 0xFF },
+	},
+};
+
+/*****************************************************************************
+ *
+ * Module stuff.
+ *
+ *****************************************************************************/
+ 
+static int __init init(void)
+{
+	int ret = 0;
+
+	ret = ip_conntrack_helper_register(&snmp_helper);
+	if (ret < 0)
+		return ret;
+	ret = ip_conntrack_helper_register(&snmp_trap_helper);
+	if (ret < 0) {
+		ip_conntrack_helper_unregister(&snmp_helper);
+		return ret;
+	}
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	ip_conntrack_helper_unregister(&snmp_helper);
+	ip_conntrack_helper_unregister(&snmp_trap_helper);
+}
+
+module_init(init);
+module_exit(fini);
+
+module_param(debug, bool, 0600);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
new file mode 100644
index 000000000000..dec4a74212cd
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -0,0 +1,349 @@
+/* This file contains all the functions required for the standalone
+   ip_nat module.
+
+   These are not required by the compatibility layer.
+*/
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
+ * 	- new API and handling of conntrack/nat helpers
+ * 	- now capable of multiple expectations for one master
+ * */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/icmp.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/spinlock.h>
+
+#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
+#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock)
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING"  \
+			   : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \
+			      : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT"  \
+			         : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN"  \
+				    : "*ERROR*")))
+
+static unsigned int
+ip_nat_fn(unsigned int hooknum,
+	  struct sk_buff **pskb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  int (*okfn)(struct sk_buff *))
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	struct ip_nat_info *info;
+	/* maniptype == SRC for postrouting. */
+	enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum);
+
+	/* We never see fragments: conntrack defrags on pre-routing
+	   and local-out, and ip_nat_out protects post-routing. */
+	IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off
+		       & htons(IP_MF|IP_OFFSET)));
+
+	(*pskb)->nfcache |= NFC_UNKNOWN;
+
+	/* If we had a hardware checksum before, it's now invalid */
+	if ((*pskb)->ip_summed == CHECKSUM_HW)
+		if (skb_checksum_help(*pskb, (out == NULL)))
+			return NF_DROP;
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+	/* Can't track?  It's not due to stress, or conntrack would
+	   have dropped it.  Hence it's the user's responsibilty to
+	   packet filter it out, or implement conntrack/NAT for that
+	   protocol. 8) --RR */
+	if (!ct) {
+		/* Exception: ICMP redirect to new connection (not in
+                   hash table yet).  We must not let this through, in
+                   case we're doing NAT to the same network. */
+		if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
+			struct icmphdr _hdr, *hp;
+
+			hp = skb_header_pointer(*pskb,
+						(*pskb)->nh.iph->ihl*4,
+						sizeof(_hdr), &_hdr);
+			if (hp != NULL &&
+			    hp->type == ICMP_REDIRECT)
+				return NF_DROP;
+		}
+		return NF_ACCEPT;
+	}
+
+	switch (ctinfo) {
+	case IP_CT_RELATED:
+	case IP_CT_RELATED+IP_CT_IS_REPLY:
+		if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
+			if (!icmp_reply_translation(pskb, ct, maniptype,
+						    CTINFO2DIR(ctinfo)))
+				return NF_DROP;
+			else
+				return NF_ACCEPT;
+		}
+		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
+	case IP_CT_NEW:
+		info = &ct->nat.info;
+
+		/* Seen it before?  This can happen for loopback, retrans,
+		   or local packets.. */
+		if (!ip_nat_initialized(ct, maniptype)) {
+			unsigned int ret;
+
+			/* LOCAL_IN hook doesn't have a chain!  */
+			if (hooknum == NF_IP_LOCAL_IN)
+				ret = alloc_null_binding(ct, info, hooknum);
+			else
+				ret = ip_nat_rule_find(pskb, hooknum,
+						       in, out, ct,
+						       info);
+
+			if (ret != NF_ACCEPT) {
+				return ret;
+			}
+		} else
+			DEBUGP("Already setup manip %s for ct %p\n",
+			       maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
+			       ct);
+		break;
+
+	default:
+		/* ESTABLISHED */
+		IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED
+			     || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
+		info = &ct->nat.info;
+	}
+
+	IP_NF_ASSERT(info);
+	return nat_packet(ct, ctinfo, hooknum, pskb);
+}
+
+static unsigned int
+ip_nat_in(unsigned int hooknum,
+          struct sk_buff **pskb,
+          const struct net_device *in,
+          const struct net_device *out,
+          int (*okfn)(struct sk_buff *))
+{
+	u_int32_t saddr, daddr;
+	unsigned int ret;
+
+	saddr = (*pskb)->nh.iph->saddr;
+	daddr = (*pskb)->nh.iph->daddr;
+
+	ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN
+	    && ((*pskb)->nh.iph->saddr != saddr
+	        || (*pskb)->nh.iph->daddr != daddr)) {
+		dst_release((*pskb)->dst);
+		(*pskb)->dst = NULL;
+	}
+	return ret;
+}
+
+static unsigned int
+ip_nat_out(unsigned int hooknum,
+	   struct sk_buff **pskb,
+	   const struct net_device *in,
+	   const struct net_device *out,
+	   int (*okfn)(struct sk_buff *))
+{
+	/* root is playing with raw sockets. */
+	if ((*pskb)->len < sizeof(struct iphdr)
+	    || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	/* We can hit fragment here; forwarded packets get
+	   defragmented by connection tracking coming in, then
+	   fragmented (grr) by the forward code.
+
+	   In future: If we have nfct != NULL, AND we have NAT
+	   initialized, AND there is no helper, then we can do full
+	   NAPT on the head, and IP-address-only NAT on the rest.
+
+	   I'm starting to have nightmares about fragments.  */
+
+	if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
+		*pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT);
+
+		if (!*pskb)
+			return NF_STOLEN;
+	}
+
+	return ip_nat_fn(hooknum, pskb, in, out, okfn);
+}
+
+static unsigned int
+ip_nat_local_fn(unsigned int hooknum,
+		struct sk_buff **pskb,
+		const struct net_device *in,
+		const struct net_device *out,
+		int (*okfn)(struct sk_buff *))
+{
+	u_int32_t saddr, daddr;
+	unsigned int ret;
+
+	/* root is playing with raw sockets. */
+	if ((*pskb)->len < sizeof(struct iphdr)
+	    || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
+		return NF_ACCEPT;
+
+	saddr = (*pskb)->nh.iph->saddr;
+	daddr = (*pskb)->nh.iph->daddr;
+
+	ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN
+	    && ((*pskb)->nh.iph->saddr != saddr
+		|| (*pskb)->nh.iph->daddr != daddr))
+		return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
+	return ret;
+}
+
+/* We must be after connection tracking and before packet filtering. */
+
+/* Before packet filtering, change destination */
+static struct nf_hook_ops ip_nat_in_ops = {
+	.hook		= ip_nat_in,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_PRE_ROUTING,
+	.priority	= NF_IP_PRI_NAT_DST,
+};
+
+/* After packet filtering, change source */
+static struct nf_hook_ops ip_nat_out_ops = {
+	.hook		= ip_nat_out,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_POST_ROUTING,
+	.priority	= NF_IP_PRI_NAT_SRC,
+};
+
+/* Before packet filtering, change destination */
+static struct nf_hook_ops ip_nat_local_out_ops = {
+	.hook		= ip_nat_local_fn,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_LOCAL_OUT,
+	.priority	= NF_IP_PRI_NAT_DST,
+};
+
+/* After packet filtering, change source for reply packets of LOCAL_OUT DNAT */
+static struct nf_hook_ops ip_nat_local_in_ops = {
+	.hook		= ip_nat_fn,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_IP_LOCAL_IN,
+	.priority	= NF_IP_PRI_NAT_SRC,
+};
+
+static int init_or_cleanup(int init)
+{
+	int ret = 0;
+
+	need_ip_conntrack();
+
+	if (!init) goto cleanup;
+
+	ret = ip_nat_rule_init();
+	if (ret < 0) {
+		printk("ip_nat_init: can't setup rules.\n");
+		goto cleanup_nothing;
+	}
+	ret = ip_nat_init();
+	if (ret < 0) {
+		printk("ip_nat_init: can't setup rules.\n");
+		goto cleanup_rule_init;
+	}
+	ret = nf_register_hook(&ip_nat_in_ops);
+	if (ret < 0) {
+		printk("ip_nat_init: can't register in hook.\n");
+		goto cleanup_nat;
+	}
+	ret = nf_register_hook(&ip_nat_out_ops);
+	if (ret < 0) {
+		printk("ip_nat_init: can't register out hook.\n");
+		goto cleanup_inops;
+	}
+	ret = nf_register_hook(&ip_nat_local_out_ops);
+	if (ret < 0) {
+		printk("ip_nat_init: can't register local out hook.\n");
+		goto cleanup_outops;
+	}
+	ret = nf_register_hook(&ip_nat_local_in_ops);
+	if (ret < 0) {
+		printk("ip_nat_init: can't register local in hook.\n");
+		goto cleanup_localoutops;
+	}
+	return ret;
+
+ cleanup:
+	nf_unregister_hook(&ip_nat_local_in_ops);
+ cleanup_localoutops:
+	nf_unregister_hook(&ip_nat_local_out_ops);
+ cleanup_outops:
+	nf_unregister_hook(&ip_nat_out_ops);
+ cleanup_inops:
+	nf_unregister_hook(&ip_nat_in_ops);
+ cleanup_nat:
+	ip_nat_cleanup();
+ cleanup_rule_init:
+	ip_nat_rule_cleanup();
+ cleanup_nothing:
+	MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock);
+	return ret;
+}
+
+static int __init init(void)
+{
+	return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+	init_or_cleanup(0);
+}
+
+module_init(init);
+module_exit(fini);
+
+EXPORT_SYMBOL(ip_nat_setup_info);
+EXPORT_SYMBOL(ip_nat_protocol_register);
+EXPORT_SYMBOL(ip_nat_protocol_unregister);
+EXPORT_SYMBOL(ip_nat_cheat_check);
+EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
+EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
+EXPORT_SYMBOL(ip_nat_used_tuple);
+EXPORT_SYMBOL(ip_nat_follow_master);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c
new file mode 100644
index 000000000000..0343e0d64674
--- /dev/null
+++ b/net/ipv4/netfilter/ip_nat_tftp.c
@@ -0,0 +1,70 @@
+/* (C) 2001-2002 Magnus Boden <mb@ozaba.mine.nu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Version: 0.0.7
+ *
+ * Thu 21 Mar 2002 Harald Welte <laforge@gnumonks.org>
+ * 	- Port to newnat API
+ *
+ * This module currently supports DNAT:
+ * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y
+ *
+ * and SNAT:
+ * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x }
+ *
+ * It has not been tested with
+ * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip
+ * If you do test this please let me know if it works or not.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_conntrack_tftp.h>
+#include <linux/netfilter_ipv4/ip_nat_helper.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/moduleparam.h>
+
+MODULE_AUTHOR("Magnus Boden <mb@ozaba.mine.nu>");
+MODULE_DESCRIPTION("tftp NAT helper");
+MODULE_LICENSE("GPL");
+
+static unsigned int help(struct sk_buff **pskb,
+			 enum ip_conntrack_info ctinfo,
+			 struct ip_conntrack_expect *exp)
+{
+	exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port;
+	exp->dir = IP_CT_DIR_REPLY;
+	exp->expectfn = ip_nat_follow_master;
+	if (ip_conntrack_expect_related(exp) != 0) {
+		ip_conntrack_expect_free(exp);
+		return NF_DROP;
+	}
+	return NF_ACCEPT;
+}
+
+static void __exit fini(void)
+{
+	ip_nat_tftp_hook = NULL;
+	/* Make sure noone calls it, meanwhile. */
+	synchronize_net();
+}
+
+static int __init init(void)
+{
+	BUG_ON(ip_nat_tftp_hook);
+	ip_nat_tftp_hook = help;
+	return 0;
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
new file mode 100644
index 000000000000..9e40dffc204f
--- /dev/null
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -0,0 +1,741 @@
+/*
+ * This is a module which is used for queueing IPv4 packets and
+ * communicating with userspace via netlink.
+ *
+ * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 2000-03-27: Simplified code (thanks to Andi Kleen for clues).
+ * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report).
+ * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian 
+ *             Zander).
+ * 2000-08-01: Added Nick Williams' MAC support.
+ * 2002-06-25: Code cleanup.
+ * 2005-01-10: Added /proc counter for dropped packets; fixed so
+ *             packets aren't delivered to user space if they're going 
+ *             to be dropped. 
+ *
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <linux/ip.h>
+#include <linux/notifier.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_queue.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netlink.h>
+#include <linux/spinlock.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <net/sock.h>
+#include <net/route.h>
+
+#define IPQ_QMAX_DEFAULT 1024
+#define IPQ_PROC_FS_NAME "ip_queue"
+#define NET_IPQ_QMAX 2088
+#define NET_IPQ_QMAX_NAME "ip_queue_maxlen"
+
+struct ipq_rt_info {
+	__u8 tos;
+	__u32 daddr;
+	__u32 saddr;
+};
+
+struct ipq_queue_entry {
+	struct list_head list;
+	struct nf_info *info;
+	struct sk_buff *skb;
+	struct ipq_rt_info rt_info;
+};
+
+typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long);
+
+static unsigned char copy_mode = IPQ_COPY_NONE;
+static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT;
+static DEFINE_RWLOCK(queue_lock);
+static int peer_pid;
+static unsigned int copy_range;
+static unsigned int queue_total;
+static unsigned int queue_dropped = 0;
+static unsigned int queue_user_dropped = 0;
+static struct sock *ipqnl;
+static LIST_HEAD(queue_list);
+static DECLARE_MUTEX(ipqnl_sem);
+
+static void
+ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict)
+{
+	nf_reinject(entry->skb, entry->info, verdict);
+	kfree(entry);
+}
+
+static inline void
+__ipq_enqueue_entry(struct ipq_queue_entry *entry)
+{
+       list_add(&entry->list, &queue_list);
+       queue_total++;
+}
+
+/*
+ * Find and return a queued entry matched by cmpfn, or return the last
+ * entry if cmpfn is NULL.
+ */
+static inline struct ipq_queue_entry *
+__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data)
+{
+	struct list_head *p;
+
+	list_for_each_prev(p, &queue_list) {
+		struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p;
+		
+		if (!cmpfn || cmpfn(entry, data))
+			return entry;
+	}
+	return NULL;
+}
+
+static inline void
+__ipq_dequeue_entry(struct ipq_queue_entry *entry)
+{
+	list_del(&entry->list);
+	queue_total--;
+}
+
+static inline struct ipq_queue_entry *
+__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
+{
+	struct ipq_queue_entry *entry;
+
+	entry = __ipq_find_entry(cmpfn, data);
+	if (entry == NULL)
+		return NULL;
+
+	__ipq_dequeue_entry(entry);
+	return entry;
+}
+
+
+static inline void
+__ipq_flush(int verdict)
+{
+	struct ipq_queue_entry *entry;
+	
+	while ((entry = __ipq_find_dequeue_entry(NULL, 0)))
+		ipq_issue_verdict(entry, verdict);
+}
+
+static inline int
+__ipq_set_mode(unsigned char mode, unsigned int range)
+{
+	int status = 0;
+	
+	switch(mode) {
+	case IPQ_COPY_NONE:
+	case IPQ_COPY_META:
+		copy_mode = mode;
+		copy_range = 0;
+		break;
+		
+	case IPQ_COPY_PACKET:
+		copy_mode = mode;
+		copy_range = range;
+		if (copy_range > 0xFFFF)
+			copy_range = 0xFFFF;
+		break;
+		
+	default:
+		status = -EINVAL;
+
+	}
+	return status;
+}
+
+static inline void
+__ipq_reset(void)
+{
+	peer_pid = 0;
+	net_disable_timestamp();
+	__ipq_set_mode(IPQ_COPY_NONE, 0);
+	__ipq_flush(NF_DROP);
+}
+
+static struct ipq_queue_entry *
+ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data)
+{
+	struct ipq_queue_entry *entry;
+	
+	write_lock_bh(&queue_lock);
+	entry = __ipq_find_dequeue_entry(cmpfn, data);
+	write_unlock_bh(&queue_lock);
+	return entry;
+}
+
+static void
+ipq_flush(int verdict)
+{
+	write_lock_bh(&queue_lock);
+	__ipq_flush(verdict);
+	write_unlock_bh(&queue_lock);
+}
+
+static struct sk_buff *
+ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
+{
+	unsigned char *old_tail;
+	size_t size = 0;
+	size_t data_len = 0;
+	struct sk_buff *skb;
+	struct ipq_packet_msg *pmsg;
+	struct nlmsghdr *nlh;
+
+	read_lock_bh(&queue_lock);
+	
+	switch (copy_mode) {
+	case IPQ_COPY_META:
+	case IPQ_COPY_NONE:
+		size = NLMSG_SPACE(sizeof(*pmsg));
+		data_len = 0;
+		break;
+	
+	case IPQ_COPY_PACKET:
+		if (copy_range == 0 || copy_range > entry->skb->len)
+			data_len = entry->skb->len;
+		else
+			data_len = copy_range;
+		
+		size = NLMSG_SPACE(sizeof(*pmsg) + data_len);
+		break;
+	
+	default:
+		*errp = -EINVAL;
+		read_unlock_bh(&queue_lock);
+		return NULL;
+	}
+
+	read_unlock_bh(&queue_lock);
+
+	skb = alloc_skb(size, GFP_ATOMIC);
+	if (!skb)
+		goto nlmsg_failure;
+		
+	old_tail= skb->tail;
+	nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh));
+	pmsg = NLMSG_DATA(nlh);
+	memset(pmsg, 0, sizeof(*pmsg));
+
+	pmsg->packet_id       = (unsigned long )entry;
+	pmsg->data_len        = data_len;
+	pmsg->timestamp_sec   = entry->skb->stamp.tv_sec;
+	pmsg->timestamp_usec  = entry->skb->stamp.tv_usec;
+	pmsg->mark            = entry->skb->nfmark;
+	pmsg->hook            = entry->info->hook;
+	pmsg->hw_protocol     = entry->skb->protocol;
+	
+	if (entry->info->indev)
+		strcpy(pmsg->indev_name, entry->info->indev->name);
+	else
+		pmsg->indev_name[0] = '\0';
+	
+	if (entry->info->outdev)
+		strcpy(pmsg->outdev_name, entry->info->outdev->name);
+	else
+		pmsg->outdev_name[0] = '\0';
+	
+	if (entry->info->indev && entry->skb->dev) {
+		pmsg->hw_type = entry->skb->dev->type;
+		if (entry->skb->dev->hard_header_parse)
+			pmsg->hw_addrlen =
+				entry->skb->dev->hard_header_parse(entry->skb,
+				                                   pmsg->hw_addr);
+	}
+	
+	if (data_len)
+		if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len))
+			BUG();
+		
+	nlh->nlmsg_len = skb->tail - old_tail;
+	return skb;
+
+nlmsg_failure:
+	if (skb)
+		kfree_skb(skb);
+	*errp = -EINVAL;
+	printk(KERN_ERR "ip_queue: error creating packet message\n");
+	return NULL;
+}
+
+static int
+ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data)
+{
+	int status = -EINVAL;
+	struct sk_buff *nskb;
+	struct ipq_queue_entry *entry;
+
+	if (copy_mode == IPQ_COPY_NONE)
+		return -EAGAIN;
+
+	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
+	if (entry == NULL) {
+		printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n");
+		return -ENOMEM;
+	}
+
+	entry->info = info;
+	entry->skb = skb;
+
+	if (entry->info->hook == NF_IP_LOCAL_OUT) {
+		struct iphdr *iph = skb->nh.iph;
+
+		entry->rt_info.tos = iph->tos;
+		entry->rt_info.daddr = iph->daddr;
+		entry->rt_info.saddr = iph->saddr;
+	}
+
+	nskb = ipq_build_packet_message(entry, &status);
+	if (nskb == NULL)
+		goto err_out_free;
+		
+	write_lock_bh(&queue_lock);
+	
+	if (!peer_pid)
+		goto err_out_free_nskb; 
+
+	if (queue_total >= queue_maxlen) {
+                queue_dropped++;
+		status = -ENOSPC;
+		if (net_ratelimit())
+		          printk (KERN_WARNING "ip_queue: full at %d entries, "
+				  "dropping packets(s). Dropped: %d\n", queue_total,
+				  queue_dropped);
+		goto err_out_free_nskb;
+	}
+
+ 	/* netlink_unicast will either free the nskb or attach it to a socket */ 
+	status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT);
+	if (status < 0) {
+	        queue_user_dropped++;
+		goto err_out_unlock;
+	}
+
+	__ipq_enqueue_entry(entry);
+
+	write_unlock_bh(&queue_lock);
+	return status;
+
+err_out_free_nskb:
+	kfree_skb(nskb); 
+	
+err_out_unlock:
+	write_unlock_bh(&queue_lock);
+
+err_out_free:
+	kfree(entry);
+	return status;
+}
+
+static int
+ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e)
+{
+	int diff;
+	struct iphdr *user_iph = (struct iphdr *)v->payload;
+
+	if (v->data_len < sizeof(*user_iph))
+		return 0;
+	diff = v->data_len - e->skb->len;
+	if (diff < 0)
+		skb_trim(e->skb, v->data_len);
+	else if (diff > 0) {
+		if (v->data_len > 0xFFFF)
+			return -EINVAL;
+		if (diff > skb_tailroom(e->skb)) {
+			struct sk_buff *newskb;
+			
+			newskb = skb_copy_expand(e->skb,
+			                         skb_headroom(e->skb),
+			                         diff,
+			                         GFP_ATOMIC);
+			if (newskb == NULL) {
+				printk(KERN_WARNING "ip_queue: OOM "
+				      "in mangle, dropping packet\n");
+				return -ENOMEM;
+			}
+			if (e->skb->sk)
+				skb_set_owner_w(newskb, e->skb->sk);
+			kfree_skb(e->skb);
+			e->skb = newskb;
+		}
+		skb_put(e->skb, diff);
+	}
+	if (!skb_ip_make_writable(&e->skb, v->data_len))
+		return -ENOMEM;
+	memcpy(e->skb->data, v->payload, v->data_len);
+	e->skb->nfcache |= NFC_ALTERED;
+
+	/*
+	 * Extra routing may needed on local out, as the QUEUE target never
+	 * returns control to the table.
+	 */
+	if (e->info->hook == NF_IP_LOCAL_OUT) {
+		struct iphdr *iph = e->skb->nh.iph;
+
+		if (!(iph->tos == e->rt_info.tos
+		      && iph->daddr == e->rt_info.daddr
+		      && iph->saddr == e->rt_info.saddr))
+			return ip_route_me_harder(&e->skb);
+	}
+	return 0;
+}
+
+static inline int
+id_cmp(struct ipq_queue_entry *e, unsigned long id)
+{
+	return (id == (unsigned long )e);
+}
+
+static int
+ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len)
+{
+	struct ipq_queue_entry *entry;
+
+	if (vmsg->value > NF_MAX_VERDICT)
+		return -EINVAL;
+
+	entry = ipq_find_dequeue_entry(id_cmp, vmsg->id);
+	if (entry == NULL)
+		return -ENOENT;
+	else {
+		int verdict = vmsg->value;
+		
+		if (vmsg->data_len && vmsg->data_len == len)
+			if (ipq_mangle_ipv4(vmsg, entry) < 0)
+				verdict = NF_DROP;
+		
+		ipq_issue_verdict(entry, verdict);
+		return 0;
+	}
+}
+
+static int
+ipq_set_mode(unsigned char mode, unsigned int range)
+{
+	int status;
+
+	write_lock_bh(&queue_lock);
+	status = __ipq_set_mode(mode, range);
+	write_unlock_bh(&queue_lock);
+	return status;
+}
+
+static int
+ipq_receive_peer(struct ipq_peer_msg *pmsg,
+                 unsigned char type, unsigned int len)
+{
+	int status = 0;
+
+	if (len < sizeof(*pmsg))
+		return -EINVAL;
+
+	switch (type) {
+	case IPQM_MODE:
+		status = ipq_set_mode(pmsg->msg.mode.value,
+		                      pmsg->msg.mode.range);
+		break;
+		
+	case IPQM_VERDICT:
+		if (pmsg->msg.verdict.value > NF_MAX_VERDICT)
+			status = -EINVAL;
+		else
+			status = ipq_set_verdict(&pmsg->msg.verdict,
+			                         len - sizeof(*pmsg));
+			break;
+	default:
+		status = -EINVAL;
+	}
+	return status;
+}
+
+static int
+dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex)
+{
+	if (entry->info->indev)
+		if (entry->info->indev->ifindex == ifindex)
+			return 1;
+			
+	if (entry->info->outdev)
+		if (entry->info->outdev->ifindex == ifindex)
+			return 1;
+
+	return 0;
+}
+
+static void
+ipq_dev_drop(int ifindex)
+{
+	struct ipq_queue_entry *entry;
+	
+	while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL)
+		ipq_issue_verdict(entry, NF_DROP);
+}
+
+#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0)
+
+static inline void
+ipq_rcv_skb(struct sk_buff *skb)
+{
+	int status, type, pid, flags, nlmsglen, skblen;
+	struct nlmsghdr *nlh;
+
+	skblen = skb->len;
+	if (skblen < sizeof(*nlh))
+		return;
+
+	nlh = (struct nlmsghdr *)skb->data;
+	nlmsglen = nlh->nlmsg_len;
+	if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen)
+		return;
+
+	pid = nlh->nlmsg_pid;
+	flags = nlh->nlmsg_flags;
+	
+	if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI)
+		RCV_SKB_FAIL(-EINVAL);
+		
+	if (flags & MSG_TRUNC)
+		RCV_SKB_FAIL(-ECOMM);
+		
+	type = nlh->nlmsg_type;
+	if (type < NLMSG_NOOP || type >= IPQM_MAX)
+		RCV_SKB_FAIL(-EINVAL);
+		
+	if (type <= IPQM_BASE)
+		return;
+		
+	if (security_netlink_recv(skb))
+		RCV_SKB_FAIL(-EPERM);
+	
+	write_lock_bh(&queue_lock);
+	
+	if (peer_pid) {
+		if (peer_pid != pid) {
+			write_unlock_bh(&queue_lock);
+			RCV_SKB_FAIL(-EBUSY);
+		}
+	} else {
+		net_enable_timestamp();
+		peer_pid = pid;
+	}
+		
+	write_unlock_bh(&queue_lock);
+	
+	status = ipq_receive_peer(NLMSG_DATA(nlh), type,
+	                          skblen - NLMSG_LENGTH(0));
+	if (status < 0)
+		RCV_SKB_FAIL(status);
+		
+	if (flags & NLM_F_ACK)
+		netlink_ack(skb, nlh, 0);
+        return;
+}
+
+static void
+ipq_rcv_sk(struct sock *sk, int len)
+{
+	do {
+		struct sk_buff *skb;
+
+		if (down_trylock(&ipqnl_sem))
+			return;
+			
+		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+			ipq_rcv_skb(skb);
+			kfree_skb(skb);
+		}
+		
+		up(&ipqnl_sem);
+
+	} while (ipqnl && ipqnl->sk_receive_queue.qlen);
+}
+
+static int
+ipq_rcv_dev_event(struct notifier_block *this,
+                  unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	/* Drop any packets associated with the downed device */
+	if (event == NETDEV_DOWN)
+		ipq_dev_drop(dev->ifindex);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ipq_dev_notifier = {
+	.notifier_call	= ipq_rcv_dev_event,
+};
+
+static int
+ipq_rcv_nl_event(struct notifier_block *this,
+                 unsigned long event, void *ptr)
+{
+	struct netlink_notify *n = ptr;
+
+	if (event == NETLINK_URELEASE &&
+	    n->protocol == NETLINK_FIREWALL && n->pid) {
+		write_lock_bh(&queue_lock);
+		if (n->pid == peer_pid)
+			__ipq_reset();
+		write_unlock_bh(&queue_lock);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ipq_nl_notifier = {
+	.notifier_call	= ipq_rcv_nl_event,
+};
+
+static struct ctl_table_header *ipq_sysctl_header;
+
+static ctl_table ipq_table[] = {
+	{
+		.ctl_name	= NET_IPQ_QMAX,
+		.procname	= NET_IPQ_QMAX_NAME,
+		.data		= &queue_maxlen,
+		.maxlen		= sizeof(queue_maxlen),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec
+	},
+ 	{ .ctl_name = 0 }
+};
+
+static ctl_table ipq_dir_table[] = {
+	{
+		.ctl_name	= NET_IPV4,
+		.procname	= "ipv4",
+		.mode		= 0555,
+		.child		= ipq_table
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table ipq_root_table[] = {
+	{
+		.ctl_name	= CTL_NET,
+		.procname	= "net",
+		.mode		= 0555,
+		.child		= ipq_dir_table
+	},
+	{ .ctl_name = 0 }
+};
+
+#ifdef CONFIG_PROC_FS
+static int
+ipq_get_info(char *buffer, char **start, off_t offset, int length)
+{
+	int len;
+
+	read_lock_bh(&queue_lock);
+	
+	len = sprintf(buffer,
+	              "Peer PID          : %d\n"
+	              "Copy mode         : %hu\n"
+	              "Copy range        : %u\n"
+	              "Queue length      : %u\n"
+	              "Queue max. length : %u\n"
+		      "Queue dropped     : %u\n"
+		      "Netlink dropped   : %u\n",
+	              peer_pid,
+	              copy_mode,
+	              copy_range,
+	              queue_total,
+	              queue_maxlen,
+		      queue_dropped,
+		      queue_user_dropped);
+
+	read_unlock_bh(&queue_lock);
+	
+	*start = buffer + offset;
+	len -= offset;
+	if (len > length)
+		len = length;
+	else if (len < 0)
+		len = 0;
+	return len;
+}
+#endif /* CONFIG_PROC_FS */
+
+static int
+init_or_cleanup(int init)
+{
+	int status = -ENOMEM;
+	struct proc_dir_entry *proc;
+	
+	if (!init)
+		goto cleanup;
+
+	netlink_register_notifier(&ipq_nl_notifier);
+	ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk);
+	if (ipqnl == NULL) {
+		printk(KERN_ERR "ip_queue: failed to create netlink socket\n");
+		goto cleanup_netlink_notifier;
+	}
+
+	proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info);
+	if (proc)
+		proc->owner = THIS_MODULE;
+	else {
+		printk(KERN_ERR "ip_queue: failed to create proc entry\n");
+		goto cleanup_ipqnl;
+	}
+	
+	register_netdevice_notifier(&ipq_dev_notifier);
+	ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0);
+	
+	status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL);
+	if (status < 0) {
+		printk(KERN_ERR "ip_queue: failed to register queue handler\n");
+		goto cleanup_sysctl;
+	}
+	return status;
+
+cleanup:
+	nf_unregister_queue_handler(PF_INET);
+	synchronize_net();
+	ipq_flush(NF_DROP);
+	
+cleanup_sysctl:
+	unregister_sysctl_table(ipq_sysctl_header);
+	unregister_netdevice_notifier(&ipq_dev_notifier);
+	proc_net_remove(IPQ_PROC_FS_NAME);
+	
+cleanup_ipqnl:
+	sock_release(ipqnl->sk_socket);
+	down(&ipqnl_sem);
+	up(&ipqnl_sem);
+	
+cleanup_netlink_notifier:
+	netlink_unregister_notifier(&ipq_nl_notifier);
+	return status;
+}
+
+static int __init init(void)
+{
+	
+	return init_or_cleanup(1);
+}
+
+static void __exit fini(void)
+{
+	init_or_cleanup(0);
+}
+
+MODULE_DESCRIPTION("IPv4 packet queue handler");
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_LICENSE("GPL");
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
new file mode 100644
index 000000000000..8a54f92b8496
--- /dev/null
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -0,0 +1,1964 @@
+/*
+ * Packet matching code.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 19 Jan 2002 Harald Welte <laforge@gnumonks.org>
+ * 	- increase module usage count as soon as we have rules inside
+ * 	  a table
+ */
+#include <linux/config.h>
+#include <linux/cache.h>
+#include <linux/skbuff.h>
+#include <linux/kmod.h>
+#include <linux/vmalloc.h>
+#include <linux/netdevice.h>
+#include <linux/module.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/ip.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <linux/proc_fs.h>
+#include <linux/err.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("IPv4 packet filter");
+
+/*#define DEBUG_IP_FIREWALL*/
+/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */
+/*#define DEBUG_IP_FIREWALL_USER*/
+
+#ifdef DEBUG_IP_FIREWALL
+#define dprintf(format, args...)  printk(format , ## args)
+#else
+#define dprintf(format, args...)
+#endif
+
+#ifdef DEBUG_IP_FIREWALL_USER
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#ifdef CONFIG_NETFILTER_DEBUG
+#define IP_NF_ASSERT(x)						\
+do {								\
+	if (!(x))						\
+		printk("IP_NF_ASSERT: %s:%s:%u\n",		\
+		       __FUNCTION__, __FILE__, __LINE__);	\
+} while(0)
+#else
+#define IP_NF_ASSERT(x)
+#endif
+#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
+
+static DECLARE_MUTEX(ipt_mutex);
+
+/* Must have mutex */
+#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
+#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0)
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <linux/netfilter_ipv4/listhelp.h>
+
+#if 0
+/* All the better to debug you with... */
+#define static
+#define inline
+#endif
+
+/*
+   We keep a set of rules for each CPU, so we can avoid write-locking
+   them in the softirq when updating the counters and therefore
+   only need to read-lock in the softirq; doing a write_lock_bh() in user
+   context stops packets coming through and allows user context to read
+   the counters or update the rules.
+
+   To be cache friendly on SMP, we arrange them like so:
+   [ n-entries ]
+   ... cache-align padding ...
+   [ n-entries ]
+
+   Hence the start of any table is given by get_table() below.  */
+
+/* The table itself */
+struct ipt_table_info
+{
+	/* Size per table */
+	unsigned int size;
+	/* Number of entries: FIXME. --RR */
+	unsigned int number;
+	/* Initial number of entries. Needed for module usage count */
+	unsigned int initial_entries;
+
+	/* Entry points and underflows */
+	unsigned int hook_entry[NF_IP_NUMHOOKS];
+	unsigned int underflow[NF_IP_NUMHOOKS];
+
+	/* ipt_entry tables: one per CPU */
+	char entries[0] ____cacheline_aligned;
+};
+
+static LIST_HEAD(ipt_target);
+static LIST_HEAD(ipt_match);
+static LIST_HEAD(ipt_tables);
+#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+
+#ifdef CONFIG_SMP
+#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
+#else
+#define TABLE_OFFSET(t,p) 0
+#endif
+
+#if 0
+#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
+#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
+#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0)
+#endif
+
+/* Returns whether matches rule or not. */
+static inline int
+ip_packet_match(const struct iphdr *ip,
+		const char *indev,
+		const char *outdev,
+		const struct ipt_ip *ipinfo,
+		int isfrag)
+{
+	size_t i;
+	unsigned long ret;
+
+#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg))
+
+	if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+		  IPT_INV_SRCIP)
+	    || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+		     IPT_INV_DSTIP)) {
+		dprintf("Source or dest mismatch.\n");
+
+		dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n",
+			NIPQUAD(ip->saddr),
+			NIPQUAD(ipinfo->smsk.s_addr),
+			NIPQUAD(ipinfo->src.s_addr),
+			ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : "");
+		dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n",
+			NIPQUAD(ip->daddr),
+			NIPQUAD(ipinfo->dmsk.s_addr),
+			NIPQUAD(ipinfo->dst.s_addr),
+			ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : "");
+		return 0;
+	}
+
+	/* Look for ifname matches; this should unroll nicely. */
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+		ret |= (((const unsigned long *)indev)[i]
+			^ ((const unsigned long *)ipinfo->iniface)[i])
+			& ((const unsigned long *)ipinfo->iniface_mask)[i];
+	}
+
+	if (FWINV(ret != 0, IPT_INV_VIA_IN)) {
+		dprintf("VIA in mismatch (%s vs %s).%s\n",
+			indev, ipinfo->iniface,
+			ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":"");
+		return 0;
+	}
+
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) {
+		ret |= (((const unsigned long *)outdev)[i]
+			^ ((const unsigned long *)ipinfo->outiface)[i])
+			& ((const unsigned long *)ipinfo->outiface_mask)[i];
+	}
+
+	if (FWINV(ret != 0, IPT_INV_VIA_OUT)) {
+		dprintf("VIA out mismatch (%s vs %s).%s\n",
+			outdev, ipinfo->outiface,
+			ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":"");
+		return 0;
+	}
+
+	/* Check specific protocol */
+	if (ipinfo->proto
+	    && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+		dprintf("Packet protocol %hi does not match %hi.%s\n",
+			ip->protocol, ipinfo->proto,
+			ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
+		return 0;
+	}
+
+	/* If we have a fragment rule but the packet is not a fragment
+	 * then we return zero */
+	if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) {
+		dprintf("Fragment rule but not fragment.%s\n",
+			ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : "");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline int
+ip_checkentry(const struct ipt_ip *ip)
+{
+	if (ip->flags & ~IPT_F_MASK) {
+		duprintf("Unknown flag bits set: %08X\n",
+			 ip->flags & ~IPT_F_MASK);
+		return 0;
+	}
+	if (ip->invflags & ~IPT_INV_MASK) {
+		duprintf("Unknown invflag bits set: %08X\n",
+			 ip->invflags & ~IPT_INV_MASK);
+		return 0;
+	}
+	return 1;
+}
+
+static unsigned int
+ipt_error(struct sk_buff **pskb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  unsigned int hooknum,
+	  const void *targinfo,
+	  void *userinfo)
+{
+	if (net_ratelimit())
+		printk("ip_tables: error: `%s'\n", (char *)targinfo);
+
+	return NF_DROP;
+}
+
+static inline
+int do_match(struct ipt_entry_match *m,
+	     const struct sk_buff *skb,
+	     const struct net_device *in,
+	     const struct net_device *out,
+	     int offset,
+	     int *hotdrop)
+{
+	/* Stop iteration if it doesn't match */
+	if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop))
+		return 1;
+	else
+		return 0;
+}
+
+static inline struct ipt_entry *
+get_entry(void *base, unsigned int offset)
+{
+	return (struct ipt_entry *)(base + offset);
+}
+
+/* Returns one of the generic firewall policies, like NF_ACCEPT. */
+unsigned int
+ipt_do_table(struct sk_buff **pskb,
+	     unsigned int hook,
+	     const struct net_device *in,
+	     const struct net_device *out,
+	     struct ipt_table *table,
+	     void *userdata)
+{
+	static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long))));
+	u_int16_t offset;
+	struct iphdr *ip;
+	u_int16_t datalen;
+	int hotdrop = 0;
+	/* Initializing verdict to NF_DROP keeps gcc happy. */
+	unsigned int verdict = NF_DROP;
+	const char *indev, *outdev;
+	void *table_base;
+	struct ipt_entry *e, *back;
+
+	/* Initialization */
+	ip = (*pskb)->nh.iph;
+	datalen = (*pskb)->len - ip->ihl * 4;
+	indev = in ? in->name : nulldevname;
+	outdev = out ? out->name : nulldevname;
+	/* We handle fragments by dealing with the first fragment as
+	 * if it was a normal packet.  All other fragments are treated
+	 * normally, except that they will NEVER match rules that ask
+	 * things we don't know, ie. tcp syn flag or ports).  If the
+	 * rule is also a fragment-specific rule, non-fragments won't
+	 * match it. */
+	offset = ntohs(ip->frag_off) & IP_OFFSET;
+
+	read_lock_bh(&table->lock);
+	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	table_base = (void *)table->private->entries
+		+ TABLE_OFFSET(table->private, smp_processor_id());
+	e = get_entry(table_base, table->private->hook_entry[hook]);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	/* Check noone else using our table */
+	if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
+	    && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
+		printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
+		       smp_processor_id(),
+		       table->name,
+		       &((struct ipt_entry *)table_base)->comefrom,
+		       ((struct ipt_entry *)table_base)->comefrom);
+	}
+	((struct ipt_entry *)table_base)->comefrom = 0x57acc001;
+#endif
+
+	/* For return from builtin chain */
+	back = get_entry(table_base, table->private->underflow[hook]);
+
+	do {
+		IP_NF_ASSERT(e);
+		IP_NF_ASSERT(back);
+		(*pskb)->nfcache |= e->nfcache;
+		if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) {
+			struct ipt_entry_target *t;
+
+			if (IPT_MATCH_ITERATE(e, do_match,
+					      *pskb, in, out,
+					      offset, &hotdrop) != 0)
+				goto no_match;
+
+			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+
+			t = ipt_get_target(e);
+			IP_NF_ASSERT(t->u.kernel.target);
+			/* Standard target? */
+			if (!t->u.kernel.target->target) {
+				int v;
+
+				v = ((struct ipt_standard_target *)t)->verdict;
+				if (v < 0) {
+					/* Pop from stack? */
+					if (v != IPT_RETURN) {
+						verdict = (unsigned)(-v) - 1;
+						break;
+					}
+					e = back;
+					back = get_entry(table_base,
+							 back->comefrom);
+					continue;
+				}
+				if (table_base + v
+				    != (void *)e + e->next_offset) {
+					/* Save old back ptr in next entry */
+					struct ipt_entry *next
+						= (void *)e + e->next_offset;
+					next->comefrom
+						= (void *)back - table_base;
+					/* set back pointer to next entry */
+					back = next;
+				}
+
+				e = get_entry(table_base, v);
+			} else {
+				/* Targets which reenter must return
+                                   abs. verdicts */
+#ifdef CONFIG_NETFILTER_DEBUG
+				((struct ipt_entry *)table_base)->comefrom
+					= 0xeeeeeeec;
+#endif
+				verdict = t->u.kernel.target->target(pskb,
+								     in, out,
+								     hook,
+								     t->data,
+								     userdata);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+				if (((struct ipt_entry *)table_base)->comefrom
+				    != 0xeeeeeeec
+				    && verdict == IPT_CONTINUE) {
+					printk("Target %s reentered!\n",
+					       t->u.kernel.target->name);
+					verdict = NF_DROP;
+				}
+				((struct ipt_entry *)table_base)->comefrom
+					= 0x57acc001;
+#endif
+				/* Target might have changed stuff. */
+				ip = (*pskb)->nh.iph;
+				datalen = (*pskb)->len - ip->ihl * 4;
+
+				if (verdict == IPT_CONTINUE)
+					e = (void *)e + e->next_offset;
+				else
+					/* Verdict */
+					break;
+			}
+		} else {
+
+		no_match:
+			e = (void *)e + e->next_offset;
+		}
+	} while (!hotdrop);
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	((struct ipt_entry *)table_base)->comefrom = 0xdead57ac;
+#endif
+	read_unlock_bh(&table->lock);
+
+#ifdef DEBUG_ALLOW_ALL
+	return NF_ACCEPT;
+#else
+	if (hotdrop)
+		return NF_DROP;
+	else return verdict;
+#endif
+}
+
+/*
+ * These are weird, but module loading must not be done with mutex
+ * held (since they will register), and we have to have a single
+ * function to use try_then_request_module().
+ */
+
+/* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
+static inline struct ipt_table *find_table_lock(const char *name)
+{
+	struct ipt_table *t;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(t, &ipt_tables, list)
+		if (strcmp(t->name, name) == 0 && try_module_get(t->me))
+			return t;
+	up(&ipt_mutex);
+	return NULL;
+}
+
+/* Find match, grabs ref.  Returns ERR_PTR() on error. */
+static inline struct ipt_match *find_match(const char *name, u8 revision)
+{
+	struct ipt_match *m;
+	int err = 0;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(m, &ipt_match, list) {
+		if (strcmp(m->name, name) == 0) {
+			if (m->revision == revision) {
+				if (try_module_get(m->me)) {
+					up(&ipt_mutex);
+					return m;
+				}
+			} else
+				err = -EPROTOTYPE; /* Found something. */
+		}
+	}
+	up(&ipt_mutex);
+	return ERR_PTR(err);
+}
+
+/* Find target, grabs ref.  Returns ERR_PTR() on error. */
+static inline struct ipt_target *find_target(const char *name, u8 revision)
+{
+	struct ipt_target *t;
+	int err = 0;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return ERR_PTR(-EINTR);
+
+	list_for_each_entry(t, &ipt_target, list) {
+		if (strcmp(t->name, name) == 0) {
+			if (t->revision == revision) {
+				if (try_module_get(t->me)) {
+					up(&ipt_mutex);
+					return t;
+				}
+			} else
+				err = -EPROTOTYPE; /* Found something. */
+		}
+	}
+	up(&ipt_mutex);
+	return ERR_PTR(err);
+}
+
+struct ipt_target *ipt_find_target(const char *name, u8 revision)
+{
+	struct ipt_target *target;
+
+	target = try_then_request_module(find_target(name, revision),
+					 "ipt_%s", name);
+	if (IS_ERR(target) || !target)
+		return NULL;
+	return target;
+}
+
+static int match_revfn(const char *name, u8 revision, int *bestp)
+{
+	struct ipt_match *m;
+	int have_rev = 0;
+
+	list_for_each_entry(m, &ipt_match, list) {
+		if (strcmp(m->name, name) == 0) {
+			if (m->revision > *bestp)
+				*bestp = m->revision;
+			if (m->revision == revision)
+				have_rev = 1;
+		}
+	}
+	return have_rev;
+}
+
+static int target_revfn(const char *name, u8 revision, int *bestp)
+{
+	struct ipt_target *t;
+	int have_rev = 0;
+
+	list_for_each_entry(t, &ipt_target, list) {
+		if (strcmp(t->name, name) == 0) {
+			if (t->revision > *bestp)
+				*bestp = t->revision;
+			if (t->revision == revision)
+				have_rev = 1;
+		}
+	}
+	return have_rev;
+}
+
+/* Returns true or false (if no such extension at all) */
+static inline int find_revision(const char *name, u8 revision,
+				int (*revfn)(const char *, u8, int *),
+				int *err)
+{
+	int have_rev, best = -1;
+
+	if (down_interruptible(&ipt_mutex) != 0) {
+		*err = -EINTR;
+		return 1;
+	}
+	have_rev = revfn(name, revision, &best);
+	up(&ipt_mutex);
+
+	/* Nothing at all?  Return 0 to try loading module. */
+	if (best == -1) {
+		*err = -ENOENT;
+		return 0;
+	}
+
+	*err = best;
+	if (!have_rev)
+		*err = -EPROTONOSUPPORT;
+	return 1;
+}
+
+
+/* All zeroes == unconditional rule. */
+static inline int
+unconditional(const struct ipt_ip *ip)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++)
+		if (((__u32 *)ip)[i])
+			return 0;
+
+	return 1;
+}
+
+/* Figures out from what hook each rule can be called: returns 0 if
+   there are loops.  Puts hook bitmask in comefrom. */
+static int
+mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+{
+	unsigned int hook;
+
+	/* No recursion; use packet counter to save back ptrs (reset
+	   to 0 as we leave), and comefrom to save source hook bitmask */
+	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
+		unsigned int pos = newinfo->hook_entry[hook];
+		struct ipt_entry *e
+			= (struct ipt_entry *)(newinfo->entries + pos);
+
+		if (!(valid_hooks & (1 << hook)))
+			continue;
+
+		/* Set initial back pointer. */
+		e->counters.pcnt = pos;
+
+		for (;;) {
+			struct ipt_standard_target *t
+				= (void *)ipt_get_target(e);
+
+			if (e->comefrom & (1 << NF_IP_NUMHOOKS)) {
+				printk("iptables: loop hook %u pos %u %08X.\n",
+				       hook, pos, e->comefrom);
+				return 0;
+			}
+			e->comefrom
+				|= ((1 << hook) | (1 << NF_IP_NUMHOOKS));
+
+			/* Unconditional return/END. */
+			if (e->target_offset == sizeof(struct ipt_entry)
+			    && (strcmp(t->target.u.user.name,
+				       IPT_STANDARD_TARGET) == 0)
+			    && t->verdict < 0
+			    && unconditional(&e->ip)) {
+				unsigned int oldpos, size;
+
+				/* Return: backtrack through the last
+				   big jump. */
+				do {
+					e->comefrom ^= (1<<NF_IP_NUMHOOKS);
+#ifdef DEBUG_IP_FIREWALL_USER
+					if (e->comefrom
+					    & (1 << NF_IP_NUMHOOKS)) {
+						duprintf("Back unset "
+							 "on hook %u "
+							 "rule %u\n",
+							 hook, pos);
+					}
+#endif
+					oldpos = pos;
+					pos = e->counters.pcnt;
+					e->counters.pcnt = 0;
+
+					/* We're at the start. */
+					if (pos == oldpos)
+						goto next;
+
+					e = (struct ipt_entry *)
+						(newinfo->entries + pos);
+				} while (oldpos == pos + e->next_offset);
+
+				/* Move along one */
+				size = e->next_offset;
+				e = (struct ipt_entry *)
+					(newinfo->entries + pos + size);
+				e->counters.pcnt = pos;
+				pos += size;
+			} else {
+				int newpos = t->verdict;
+
+				if (strcmp(t->target.u.user.name,
+					   IPT_STANDARD_TARGET) == 0
+				    && newpos >= 0) {
+					/* This a jump; chase it. */
+					duprintf("Jump rule %u -> %u\n",
+						 pos, newpos);
+				} else {
+					/* ... this is a fallthru */
+					newpos = pos + e->next_offset;
+				}
+				e = (struct ipt_entry *)
+					(newinfo->entries + newpos);
+				e->counters.pcnt = pos;
+				pos = newpos;
+			}
+		}
+		next:
+		duprintf("Finished chain %u\n", hook);
+	}
+	return 1;
+}
+
+static inline int
+cleanup_match(struct ipt_entry_match *m, unsigned int *i)
+{
+	if (i && (*i)-- == 0)
+		return 1;
+
+	if (m->u.kernel.match->destroy)
+		m->u.kernel.match->destroy(m->data,
+					   m->u.match_size - sizeof(*m));
+	module_put(m->u.kernel.match->me);
+	return 0;
+}
+
+static inline int
+standard_check(const struct ipt_entry_target *t,
+	       unsigned int max_offset)
+{
+	struct ipt_standard_target *targ = (void *)t;
+
+	/* Check standard info. */
+	if (t->u.target_size
+	    != IPT_ALIGN(sizeof(struct ipt_standard_target))) {
+		duprintf("standard_check: target size %u != %u\n",
+			 t->u.target_size,
+			 IPT_ALIGN(sizeof(struct ipt_standard_target)));
+		return 0;
+	}
+
+	if (targ->verdict >= 0
+	    && targ->verdict > max_offset - sizeof(struct ipt_entry)) {
+		duprintf("ipt_standard_check: bad verdict (%i)\n",
+			 targ->verdict);
+		return 0;
+	}
+
+	if (targ->verdict < -NF_MAX_VERDICT - 1) {
+		duprintf("ipt_standard_check: bad negative verdict (%i)\n",
+			 targ->verdict);
+		return 0;
+	}
+	return 1;
+}
+
+static inline int
+check_match(struct ipt_entry_match *m,
+	    const char *name,
+	    const struct ipt_ip *ip,
+	    unsigned int hookmask,
+	    unsigned int *i)
+{
+	struct ipt_match *match;
+
+	match = try_then_request_module(find_match(m->u.user.name,
+						   m->u.user.revision),
+					"ipt_%s", m->u.user.name);
+	if (IS_ERR(match) || !match) {
+		duprintf("check_match: `%s' not found\n", m->u.user.name);
+		return match ? PTR_ERR(match) : -ENOENT;
+	}
+	m->u.kernel.match = match;
+
+	if (m->u.kernel.match->checkentry
+	    && !m->u.kernel.match->checkentry(name, ip, m->data,
+					      m->u.match_size - sizeof(*m),
+					      hookmask)) {
+		module_put(m->u.kernel.match->me);
+		duprintf("ip_tables: check failed for `%s'.\n",
+			 m->u.kernel.match->name);
+		return -EINVAL;
+	}
+
+	(*i)++;
+	return 0;
+}
+
+static struct ipt_target ipt_standard_target;
+
+static inline int
+check_entry(struct ipt_entry *e, const char *name, unsigned int size,
+	    unsigned int *i)
+{
+	struct ipt_entry_target *t;
+	struct ipt_target *target;
+	int ret;
+	unsigned int j;
+
+	if (!ip_checkentry(&e->ip)) {
+		duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+		return -EINVAL;
+	}
+
+	j = 0;
+	ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j);
+	if (ret != 0)
+		goto cleanup_matches;
+
+	t = ipt_get_target(e);
+	target = try_then_request_module(find_target(t->u.user.name,
+						     t->u.user.revision),
+					 "ipt_%s", t->u.user.name);
+	if (IS_ERR(target) || !target) {
+		duprintf("check_entry: `%s' not found\n", t->u.user.name);
+		ret = target ? PTR_ERR(target) : -ENOENT;
+		goto cleanup_matches;
+	}
+	t->u.kernel.target = target;
+
+	if (t->u.kernel.target == &ipt_standard_target) {
+		if (!standard_check(t, size)) {
+			ret = -EINVAL;
+			goto cleanup_matches;
+		}
+	} else if (t->u.kernel.target->checkentry
+		   && !t->u.kernel.target->checkentry(name, e, t->data,
+						      t->u.target_size
+						      - sizeof(*t),
+						      e->comefrom)) {
+		module_put(t->u.kernel.target->me);
+		duprintf("ip_tables: check failed for `%s'.\n",
+			 t->u.kernel.target->name);
+		ret = -EINVAL;
+		goto cleanup_matches;
+	}
+
+	(*i)++;
+	return 0;
+
+ cleanup_matches:
+	IPT_MATCH_ITERATE(e, cleanup_match, &j);
+	return ret;
+}
+
+static inline int
+check_entry_size_and_hooks(struct ipt_entry *e,
+			   struct ipt_table_info *newinfo,
+			   unsigned char *base,
+			   unsigned char *limit,
+			   const unsigned int *hook_entries,
+			   const unsigned int *underflows,
+			   unsigned int *i)
+{
+	unsigned int h;
+
+	if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
+	    || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+		duprintf("Bad offset %p\n", e);
+		return -EINVAL;
+	}
+
+	if (e->next_offset
+	    < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
+		duprintf("checking: element %p size %u\n",
+			 e, e->next_offset);
+		return -EINVAL;
+	}
+
+	/* Check hooks & underflows */
+	for (h = 0; h < NF_IP_NUMHOOKS; h++) {
+		if ((unsigned char *)e - base == hook_entries[h])
+			newinfo->hook_entry[h] = hook_entries[h];
+		if ((unsigned char *)e - base == underflows[h])
+			newinfo->underflow[h] = underflows[h];
+	}
+
+	/* FIXME: underflows must be unconditional, standard verdicts
+           < 0 (not IPT_RETURN). --RR */
+
+	/* Clear counters and comefrom */
+	e->counters = ((struct ipt_counters) { 0, 0 });
+	e->comefrom = 0;
+
+	(*i)++;
+	return 0;
+}
+
+static inline int
+cleanup_entry(struct ipt_entry *e, unsigned int *i)
+{
+	struct ipt_entry_target *t;
+
+	if (i && (*i)-- == 0)
+		return 1;
+
+	/* Cleanup all matches */
+	IPT_MATCH_ITERATE(e, cleanup_match, NULL);
+	t = ipt_get_target(e);
+	if (t->u.kernel.target->destroy)
+		t->u.kernel.target->destroy(t->data,
+					    t->u.target_size - sizeof(*t));
+	module_put(t->u.kernel.target->me);
+	return 0;
+}
+
+/* Checks and translates the user-supplied table segment (held in
+   newinfo) */
+static int
+translate_table(const char *name,
+		unsigned int valid_hooks,
+		struct ipt_table_info *newinfo,
+		unsigned int size,
+		unsigned int number,
+		const unsigned int *hook_entries,
+		const unsigned int *underflows)
+{
+	unsigned int i;
+	int ret;
+
+	newinfo->size = size;
+	newinfo->number = number;
+
+	/* Init all hooks to impossible value. */
+	for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+		newinfo->hook_entry[i] = 0xFFFFFFFF;
+		newinfo->underflow[i] = 0xFFFFFFFF;
+	}
+
+	duprintf("translate_table: size %u\n", newinfo->size);
+	i = 0;
+	/* Walk through entries, checking offsets. */
+	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				check_entry_size_and_hooks,
+				newinfo,
+				newinfo->entries,
+				newinfo->entries + size,
+				hook_entries, underflows, &i);
+	if (ret != 0)
+		return ret;
+
+	if (i != number) {
+		duprintf("translate_table: %u not %u entries\n",
+			 i, number);
+		return -EINVAL;
+	}
+
+	/* Check hooks all assigned */
+	for (i = 0; i < NF_IP_NUMHOOKS; i++) {
+		/* Only hooks which are valid */
+		if (!(valid_hooks & (1 << i)))
+			continue;
+		if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
+			duprintf("Invalid hook entry %u %u\n",
+				 i, hook_entries[i]);
+			return -EINVAL;
+		}
+		if (newinfo->underflow[i] == 0xFFFFFFFF) {
+			duprintf("Invalid underflow %u %u\n",
+				 i, underflows[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (!mark_source_chains(newinfo, valid_hooks))
+		return -ELOOP;
+
+	/* Finally, each sanity check must pass */
+	i = 0;
+	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				check_entry, name, size, &i);
+
+	if (ret != 0) {
+		IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+				  cleanup_entry, &i);
+		return ret;
+	}
+
+	/* And one copy for every other CPU */
+	for (i = 1; i < num_possible_cpus(); i++) {
+		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+		       newinfo->entries,
+		       SMP_ALIGN(newinfo->size));
+	}
+
+	return ret;
+}
+
+static struct ipt_table_info *
+replace_table(struct ipt_table *table,
+	      unsigned int num_counters,
+	      struct ipt_table_info *newinfo,
+	      int *error)
+{
+	struct ipt_table_info *oldinfo;
+
+#ifdef CONFIG_NETFILTER_DEBUG
+	{
+		struct ipt_entry *table_base;
+		unsigned int i;
+
+		for (i = 0; i < num_possible_cpus(); i++) {
+			table_base =
+				(void *)newinfo->entries
+				+ TABLE_OFFSET(newinfo, i);
+
+			table_base->comefrom = 0xdead57ac;
+		}
+	}
+#endif
+
+	/* Do the substitution. */
+	write_lock_bh(&table->lock);
+	/* Check inside lock: is the old number correct? */
+	if (num_counters != table->private->number) {
+		duprintf("num_counters != table->private->number (%u/%u)\n",
+			 num_counters, table->private->number);
+		write_unlock_bh(&table->lock);
+		*error = -EAGAIN;
+		return NULL;
+	}
+	oldinfo = table->private;
+	table->private = newinfo;
+	newinfo->initial_entries = oldinfo->initial_entries;
+	write_unlock_bh(&table->lock);
+
+	return oldinfo;
+}
+
+/* Gets counters. */
+static inline int
+add_entry_to_counter(const struct ipt_entry *e,
+		     struct ipt_counters total[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+static void
+get_counters(const struct ipt_table_info *t,
+	     struct ipt_counters counters[])
+{
+	unsigned int cpu;
+	unsigned int i;
+
+	for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+		i = 0;
+		IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+				  t->size,
+				  add_entry_to_counter,
+				  counters,
+				  &i);
+	}
+}
+
+static int
+copy_entries_to_user(unsigned int total_size,
+		     struct ipt_table *table,
+		     void __user *userptr)
+{
+	unsigned int off, num, countersize;
+	struct ipt_entry *e;
+	struct ipt_counters *counters;
+	int ret = 0;
+
+	/* We need atomic snapshot of counters: rest doesn't change
+	   (other than comefrom, which userspace doesn't care
+	   about). */
+	countersize = sizeof(struct ipt_counters) * table->private->number;
+	counters = vmalloc(countersize);
+
+	if (counters == NULL)
+		return -ENOMEM;
+
+	/* First, sum counters... */
+	memset(counters, 0, countersize);
+	write_lock_bh(&table->lock);
+	get_counters(table->private, counters);
+	write_unlock_bh(&table->lock);
+
+	/* ... then copy entire thing from CPU 0... */
+	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+		ret = -EFAULT;
+		goto free_counters;
+	}
+
+	/* FIXME: use iterator macros --RR */
+	/* ... then go back and fix counters and names */
+	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
+		unsigned int i;
+		struct ipt_entry_match *m;
+		struct ipt_entry_target *t;
+
+		e = (struct ipt_entry *)(table->private->entries + off);
+		if (copy_to_user(userptr + off
+				 + offsetof(struct ipt_entry, counters),
+				 &counters[num],
+				 sizeof(counters[num])) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+
+		for (i = sizeof(struct ipt_entry);
+		     i < e->target_offset;
+		     i += m->u.match_size) {
+			m = (void *)e + i;
+
+			if (copy_to_user(userptr + off + i
+					 + offsetof(struct ipt_entry_match,
+						    u.user.name),
+					 m->u.kernel.match->name,
+					 strlen(m->u.kernel.match->name)+1)
+			    != 0) {
+				ret = -EFAULT;
+				goto free_counters;
+			}
+		}
+
+		t = ipt_get_target(e);
+		if (copy_to_user(userptr + off + e->target_offset
+				 + offsetof(struct ipt_entry_target,
+					    u.user.name),
+				 t->u.kernel.target->name,
+				 strlen(t->u.kernel.target->name)+1) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+	}
+
+ free_counters:
+	vfree(counters);
+	return ret;
+}
+
+static int
+get_entries(const struct ipt_get_entries *entries,
+	    struct ipt_get_entries __user *uptr)
+{
+	int ret;
+	struct ipt_table *t;
+
+	t = find_table_lock(entries->name);
+	if (t && !IS_ERR(t)) {
+		duprintf("t->private->number = %u\n",
+			 t->private->number);
+		if (entries->size == t->private->size)
+			ret = copy_entries_to_user(t->private->size,
+						   t, uptr->entrytable);
+		else {
+			duprintf("get_entries: I've got %u not %u!\n",
+				 t->private->size,
+				 entries->size);
+			ret = -EINVAL;
+		}
+		module_put(t->me);
+		up(&ipt_mutex);
+	} else
+		ret = t ? PTR_ERR(t) : -ENOENT;
+
+	return ret;
+}
+
+static int
+do_replace(void __user *user, unsigned int len)
+{
+	int ret;
+	struct ipt_replace tmp;
+	struct ipt_table *t;
+	struct ipt_table_info *newinfo, *oldinfo;
+	struct ipt_counters *counters;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	/* Hack: Causes ipchains to give correct error msg --RR */
+	if (len != sizeof(tmp) + tmp.size)
+		return -ENOPROTOOPT;
+
+	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
+	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
+		return -ENOMEM;
+
+	newinfo = vmalloc(sizeof(struct ipt_table_info)
+			  + SMP_ALIGN(tmp.size) * num_possible_cpus());
+	if (!newinfo)
+		return -ENOMEM;
+
+	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+			   tmp.size) != 0) {
+		ret = -EFAULT;
+		goto free_newinfo;
+	}
+
+	counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters));
+	if (!counters) {
+		ret = -ENOMEM;
+		goto free_newinfo;
+	}
+	memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
+
+	ret = translate_table(tmp.name, tmp.valid_hooks,
+			      newinfo, tmp.size, tmp.num_entries,
+			      tmp.hook_entry, tmp.underflow);
+	if (ret != 0)
+		goto free_newinfo_counters;
+
+	duprintf("ip_tables: Translated table\n");
+
+	t = try_then_request_module(find_table_lock(tmp.name),
+				    "iptable_%s", tmp.name);
+	if (!t || IS_ERR(t)) {
+		ret = t ? PTR_ERR(t) : -ENOENT;
+		goto free_newinfo_counters_untrans;
+	}
+
+	/* You lied! */
+	if (tmp.valid_hooks != t->valid_hooks) {
+		duprintf("Valid hook crap: %08X vs %08X\n",
+			 tmp.valid_hooks, t->valid_hooks);
+		ret = -EINVAL;
+		goto put_module;
+	}
+
+	oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret);
+	if (!oldinfo)
+		goto put_module;
+
+	/* Update module usage count based on number of rules */
+	duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n",
+		oldinfo->number, oldinfo->initial_entries, newinfo->number);
+	if ((oldinfo->number > oldinfo->initial_entries) || 
+	    (newinfo->number <= oldinfo->initial_entries)) 
+		module_put(t->me);
+	if ((oldinfo->number > oldinfo->initial_entries) &&
+	    (newinfo->number <= oldinfo->initial_entries))
+		module_put(t->me);
+
+	/* Get the old counters. */
+	get_counters(oldinfo, counters);
+	/* Decrease module usage counts and free resource */
+	IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
+	vfree(oldinfo);
+	if (copy_to_user(tmp.counters, counters,
+			 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
+		ret = -EFAULT;
+	vfree(counters);
+	up(&ipt_mutex);
+	return ret;
+
+ put_module:
+	module_put(t->me);
+	up(&ipt_mutex);
+ free_newinfo_counters_untrans:
+	IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ free_newinfo_counters:
+	vfree(counters);
+ free_newinfo:
+	vfree(newinfo);
+	return ret;
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static inline int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct ipt_counters addme[],
+		     unsigned int *i)
+{
+#if 0
+	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
+		 *i,
+		 (long unsigned int)e->counters.pcnt,
+		 (long unsigned int)e->counters.bcnt,
+		 (long unsigned int)addme[*i].pcnt,
+		 (long unsigned int)addme[*i].bcnt);
+#endif
+
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+static int
+do_add_counters(void __user *user, unsigned int len)
+{
+	unsigned int i;
+	struct ipt_counters_info tmp, *paddc;
+	struct ipt_table *t;
+	int ret = 0;
+
+	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
+		return -EFAULT;
+
+	if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters))
+		return -EINVAL;
+
+	paddc = vmalloc(len);
+	if (!paddc)
+		return -ENOMEM;
+
+	if (copy_from_user(paddc, user, len) != 0) {
+		ret = -EFAULT;
+		goto free;
+	}
+
+	t = find_table_lock(tmp.name);
+	if (!t || IS_ERR(t)) {
+		ret = t ? PTR_ERR(t) : -ENOENT;
+		goto free;
+	}
+
+	write_lock_bh(&t->lock);
+	if (t->private->number != paddc->num_counters) {
+		ret = -EINVAL;
+		goto unlock_up_free;
+	}
+
+	i = 0;
+	IPT_ENTRY_ITERATE(t->private->entries,
+			  t->private->size,
+			  add_counter_to_entry,
+			  paddc->counters,
+			  &i);
+ unlock_up_free:
+	write_unlock_bh(&t->lock);
+	up(&ipt_mutex);
+	module_put(t->me);
+ free:
+	vfree(paddc);
+
+	return ret;
+}
+
+static int
+do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user, unsigned int len)
+{
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IPT_SO_SET_REPLACE:
+		ret = do_replace(user, len);
+		break;
+
+	case IPT_SO_SET_ADD_COUNTERS:
+		ret = do_add_counters(user, len);
+		break;
+
+	default:
+		duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+static int
+do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
+{
+	int ret;
+
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case IPT_SO_GET_INFO: {
+		char name[IPT_TABLE_MAXNAMELEN];
+		struct ipt_table *t;
+
+		if (*len != sizeof(struct ipt_getinfo)) {
+			duprintf("length %u != %u\n", *len,
+				 sizeof(struct ipt_getinfo));
+			ret = -EINVAL;
+			break;
+		}
+
+		if (copy_from_user(name, user, sizeof(name)) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+		name[IPT_TABLE_MAXNAMELEN-1] = '\0';
+
+		t = try_then_request_module(find_table_lock(name),
+					    "iptable_%s", name);
+		if (t && !IS_ERR(t)) {
+			struct ipt_getinfo info;
+
+			info.valid_hooks = t->valid_hooks;
+			memcpy(info.hook_entry, t->private->hook_entry,
+			       sizeof(info.hook_entry));
+			memcpy(info.underflow, t->private->underflow,
+			       sizeof(info.underflow));
+			info.num_entries = t->private->number;
+			info.size = t->private->size;
+			memcpy(info.name, name, sizeof(info.name));
+
+			if (copy_to_user(user, &info, *len) != 0)
+				ret = -EFAULT;
+			else
+				ret = 0;
+			up(&ipt_mutex);
+			module_put(t->me);
+		} else
+			ret = t ? PTR_ERR(t) : -ENOENT;
+	}
+	break;
+
+	case IPT_SO_GET_ENTRIES: {
+		struct ipt_get_entries get;
+
+		if (*len < sizeof(get)) {
+			duprintf("get_entries: %u < %u\n", *len, sizeof(get));
+			ret = -EINVAL;
+		} else if (copy_from_user(&get, user, sizeof(get)) != 0) {
+			ret = -EFAULT;
+		} else if (*len != sizeof(struct ipt_get_entries) + get.size) {
+			duprintf("get_entries: %u != %u\n", *len,
+				 sizeof(struct ipt_get_entries) + get.size);
+			ret = -EINVAL;
+		} else
+			ret = get_entries(&get, user);
+		break;
+	}
+
+	case IPT_SO_GET_REVISION_MATCH:
+	case IPT_SO_GET_REVISION_TARGET: {
+		struct ipt_get_revision rev;
+		int (*revfn)(const char *, u8, int *);
+
+		if (*len != sizeof(rev)) {
+			ret = -EINVAL;
+			break;
+		}
+		if (copy_from_user(&rev, user, sizeof(rev)) != 0) {
+			ret = -EFAULT;
+			break;
+		}
+
+		if (cmd == IPT_SO_GET_REVISION_TARGET)
+			revfn = target_revfn;
+		else
+			revfn = match_revfn;
+
+		try_then_request_module(find_revision(rev.name, rev.revision,
+						      revfn, &ret),
+					"ipt_%s", rev.name);
+		break;
+	}
+
+	default:
+		duprintf("do_ipt_get_ctl: unknown request %i\n", cmd);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+/* Registration hooks for targets. */
+int
+ipt_register_target(struct ipt_target *target)
+{
+	int ret;
+
+	ret = down_interruptible(&ipt_mutex);
+	if (ret != 0)
+		return ret;
+	list_add(&target->list, &ipt_target);
+	up(&ipt_mutex);
+	return ret;
+}
+
+void
+ipt_unregister_target(struct ipt_target *target)
+{
+	down(&ipt_mutex);
+	LIST_DELETE(&ipt_target, target);
+	up(&ipt_mutex);
+}
+
+int
+ipt_register_match(struct ipt_match *match)
+{
+	int ret;
+
+	ret = down_interruptible(&ipt_mutex);
+	if (ret != 0)
+		return ret;
+
+	list_add(&match->list, &ipt_match);
+	up(&ipt_mutex);
+
+	return ret;
+}
+
+void
+ipt_unregister_match(struct ipt_match *match)
+{
+	down(&ipt_mutex);
+	LIST_DELETE(&ipt_match, match);
+	up(&ipt_mutex);
+}
+
+int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
+{
+	int ret;
+	struct ipt_table_info *newinfo;
+	static struct ipt_table_info bootstrap
+		= { 0, 0, 0, { 0 }, { 0 }, { } };
+
+	newinfo = vmalloc(sizeof(struct ipt_table_info)
+			  + SMP_ALIGN(repl->size) * num_possible_cpus());
+	if (!newinfo)
+		return -ENOMEM;
+
+	memcpy(newinfo->entries, repl->entries, repl->size);
+
+	ret = translate_table(table->name, table->valid_hooks,
+			      newinfo, repl->size,
+			      repl->num_entries,
+			      repl->hook_entry,
+			      repl->underflow);
+	if (ret != 0) {
+		vfree(newinfo);
+		return ret;
+	}
+
+	ret = down_interruptible(&ipt_mutex);
+	if (ret != 0) {
+		vfree(newinfo);
+		return ret;
+	}
+
+	/* Don't autoload: we'd eat our tail... */
+	if (list_named_find(&ipt_tables, table->name)) {
+		ret = -EEXIST;
+		goto free_unlock;
+	}
+
+	/* Simplifies replace_table code. */
+	table->private = &bootstrap;
+	if (!replace_table(table, 0, newinfo, &ret))
+		goto free_unlock;
+
+	duprintf("table->private->number = %u\n",
+		 table->private->number);
+	
+	/* save number of initial entries */
+	table->private->initial_entries = table->private->number;
+
+	rwlock_init(&table->lock);
+	list_prepend(&ipt_tables, table);
+
+ unlock:
+	up(&ipt_mutex);
+	return ret;
+
+ free_unlock:
+	vfree(newinfo);
+	goto unlock;
+}
+
+void ipt_unregister_table(struct ipt_table *table)
+{
+	down(&ipt_mutex);
+	LIST_DELETE(&ipt_tables, table);
+	up(&ipt_mutex);
+
+	/* Decrease module usage counts and free resources */
+	IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+			  cleanup_entry, NULL);
+	vfree(table->private);
+}
+
+/* Returns 1 if the port is matched by the range, 0 otherwise */
+static inline int
+port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert)
+{
+	int ret;
+
+	ret = (port >= min && port <= max) ^ invert;
+	return ret;
+}
+
+static int
+tcp_find_option(u_int8_t option,
+		const struct sk_buff *skb,
+		unsigned int optlen,
+		int invert,
+		int *hotdrop)
+{
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	u_int8_t _opt[60 - sizeof(struct tcphdr)], *op;
+	unsigned int i;
+
+	duprintf("tcp_match: finding option\n");
+
+	if (!optlen)
+		return invert;
+
+	/* If we don't have the whole header, drop packet. */
+	op = skb_header_pointer(skb,
+				skb->nh.iph->ihl*4 + sizeof(struct tcphdr),
+				optlen, _opt);
+	if (op == NULL) {
+		*hotdrop = 1;
+		return 0;
+	}
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == option) return !invert;
+		if (op[i] < 2) i++;
+		else i += op[i+1]?:1;
+	}
+
+	return invert;
+}
+
+static int
+tcp_match(const struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  const void *matchinfo,
+	  int offset,
+	  int *hotdrop)
+{
+	struct tcphdr _tcph, *th;
+	const struct ipt_tcp *tcpinfo = matchinfo;
+
+	if (offset) {
+		/* To quote Alan:
+
+		   Don't allow a fragment of TCP 8 bytes in. Nobody normal
+		   causes this. Its a cracker trying to break in by doing a
+		   flag overwrite to pass the direction checks.
+		*/
+		if (offset == 1) {
+			duprintf("Dropping evil TCP offset=1 frag.\n");
+			*hotdrop = 1;
+		}
+		/* Must not be a fragment. */
+		return 0;
+	}
+
+#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
+
+	th = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		/* We've been asked to examine this packet, and we
+		   can't.  Hence, no choice but to drop. */
+		duprintf("Dropping evil TCP offset=0 tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
+			ntohs(th->source),
+			!!(tcpinfo->invflags & IPT_TCP_INV_SRCPT)))
+		return 0;
+	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
+			ntohs(th->dest),
+			!!(tcpinfo->invflags & IPT_TCP_INV_DSTPT)))
+		return 0;
+	if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask)
+		      == tcpinfo->flg_cmp,
+		      IPT_TCP_INV_FLAGS))
+		return 0;
+	if (tcpinfo->option) {
+		if (th->doff * 4 < sizeof(_tcph)) {
+			*hotdrop = 1;
+			return 0;
+		}
+		if (!tcp_find_option(tcpinfo->option, skb,
+				     th->doff*4 - sizeof(_tcph),
+				     tcpinfo->invflags & IPT_TCP_INV_OPTION,
+				     hotdrop))
+			return 0;
+	}
+	return 1;
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+tcp_checkentry(const char *tablename,
+	       const struct ipt_ip *ip,
+	       void *matchinfo,
+	       unsigned int matchsize,
+	       unsigned int hook_mask)
+{
+	const struct ipt_tcp *tcpinfo = matchinfo;
+
+	/* Must specify proto == TCP, and no unknown invflags */
+	return ip->proto == IPPROTO_TCP
+		&& !(ip->invflags & IPT_INV_PROTO)
+		&& matchsize == IPT_ALIGN(sizeof(struct ipt_tcp))
+		&& !(tcpinfo->invflags & ~IPT_TCP_INV_MASK);
+}
+
+static int
+udp_match(const struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  const void *matchinfo,
+	  int offset,
+	  int *hotdrop)
+{
+	struct udphdr _udph, *uh;
+	const struct ipt_udp *udpinfo = matchinfo;
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	uh = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+				sizeof(_udph), &_udph);
+	if (uh == NULL) {
+		/* We've been asked to examine this packet, and we
+		   can't.  Hence, no choice but to drop. */
+		duprintf("Dropping evil UDP tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return port_match(udpinfo->spts[0], udpinfo->spts[1],
+			  ntohs(uh->source),
+			  !!(udpinfo->invflags & IPT_UDP_INV_SRCPT))
+		&& port_match(udpinfo->dpts[0], udpinfo->dpts[1],
+			      ntohs(uh->dest),
+			      !!(udpinfo->invflags & IPT_UDP_INV_DSTPT));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+udp_checkentry(const char *tablename,
+	       const struct ipt_ip *ip,
+	       void *matchinfo,
+	       unsigned int matchinfosize,
+	       unsigned int hook_mask)
+{
+	const struct ipt_udp *udpinfo = matchinfo;
+
+	/* Must specify proto == UDP, and no unknown invflags */
+	if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) {
+		duprintf("ipt_udp: Protocol %u != %u\n", ip->proto,
+			 IPPROTO_UDP);
+		return 0;
+	}
+	if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) {
+		duprintf("ipt_udp: matchsize %u != %u\n",
+			 matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp)));
+		return 0;
+	}
+	if (udpinfo->invflags & ~IPT_UDP_INV_MASK) {
+		duprintf("ipt_udp: unknown flags %X\n",
+			 udpinfo->invflags);
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Returns 1 if the type and code is matched by the range, 0 otherwise */
+static inline int
+icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code,
+		     u_int8_t type, u_int8_t code,
+		     int invert)
+{
+	return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code))
+		^ invert;
+}
+
+static int
+icmp_match(const struct sk_buff *skb,
+	   const struct net_device *in,
+	   const struct net_device *out,
+	   const void *matchinfo,
+	   int offset,
+	   int *hotdrop)
+{
+	struct icmphdr _icmph, *ic;
+	const struct ipt_icmp *icmpinfo = matchinfo;
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	ic = skb_header_pointer(skb, skb->nh.iph->ihl*4,
+				sizeof(_icmph), &_icmph);
+	if (ic == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("Dropping evil ICMP tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return icmp_type_code_match(icmpinfo->type,
+				    icmpinfo->code[0],
+				    icmpinfo->code[1],
+				    ic->type, ic->code,
+				    !!(icmpinfo->invflags&IPT_ICMP_INV));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+icmp_checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_icmp *icmpinfo = matchinfo;
+
+	/* Must specify proto == ICMP, and no unknown invflags */
+	return ip->proto == IPPROTO_ICMP
+		&& !(ip->invflags & IPT_INV_PROTO)
+		&& matchsize == IPT_ALIGN(sizeof(struct ipt_icmp))
+		&& !(icmpinfo->invflags & ~IPT_ICMP_INV);
+}
+
+/* The built-in targets: standard (NULL) and error. */
+static struct ipt_target ipt_standard_target = {
+	.name		= IPT_STANDARD_TARGET,
+};
+
+static struct ipt_target ipt_error_target = {
+	.name		= IPT_ERROR_TARGET,
+	.target		= ipt_error,
+};
+
+static struct nf_sockopt_ops ipt_sockopts = {
+	.pf		= PF_INET,
+	.set_optmin	= IPT_BASE_CTL,
+	.set_optmax	= IPT_SO_SET_MAX+1,
+	.set		= do_ipt_set_ctl,
+	.get_optmin	= IPT_BASE_CTL,
+	.get_optmax	= IPT_SO_GET_MAX+1,
+	.get		= do_ipt_get_ctl,
+};
+
+static struct ipt_match tcp_matchstruct = {
+	.name		= "tcp",
+	.match		= &tcp_match,
+	.checkentry	= &tcp_checkentry,
+};
+
+static struct ipt_match udp_matchstruct = {
+	.name		= "udp",
+	.match		= &udp_match,
+	.checkentry	= &udp_checkentry,
+};
+
+static struct ipt_match icmp_matchstruct = {
+	.name		= "icmp",
+	.match		= &icmp_match,
+	.checkentry	= &icmp_checkentry,
+};
+
+#ifdef CONFIG_PROC_FS
+static inline int print_name(const char *i,
+			     off_t start_offset, char *buffer, int length,
+			     off_t *pos, unsigned int *count)
+{
+	if ((*count)++ >= start_offset) {
+		unsigned int namelen;
+
+		namelen = sprintf(buffer + *pos, "%s\n",
+				  i + sizeof(struct list_head));
+		if (*pos + namelen > length) {
+			/* Stop iterating */
+			return 1;
+		}
+		*pos += namelen;
+	}
+	return 0;
+}
+
+static inline int print_target(const struct ipt_target *t,
+                               off_t start_offset, char *buffer, int length,
+                               off_t *pos, unsigned int *count)
+{
+	if (t == &ipt_standard_target || t == &ipt_error_target)
+		return 0;
+	return print_name((char *)t, start_offset, buffer, length, pos, count);
+}
+
+static int ipt_get_tables(char *buffer, char **start, off_t offset, int length)
+{
+	off_t pos = 0;
+	unsigned int count = 0;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return 0;
+
+	LIST_FIND(&ipt_tables, print_name, void *,
+		  offset, buffer, length, &pos, &count);
+
+	up(&ipt_mutex);
+
+	/* `start' hack - see fs/proc/generic.c line ~105 */
+	*start=(char *)((unsigned long)count-offset);
+	return pos;
+}
+
+static int ipt_get_targets(char *buffer, char **start, off_t offset, int length)
+{
+	off_t pos = 0;
+	unsigned int count = 0;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return 0;
+
+	LIST_FIND(&ipt_target, print_target, struct ipt_target *,
+		  offset, buffer, length, &pos, &count);
+	
+	up(&ipt_mutex);
+
+	*start = (char *)((unsigned long)count - offset);
+	return pos;
+}
+
+static int ipt_get_matches(char *buffer, char **start, off_t offset, int length)
+{
+	off_t pos = 0;
+	unsigned int count = 0;
+
+	if (down_interruptible(&ipt_mutex) != 0)
+		return 0;
+	
+	LIST_FIND(&ipt_match, print_name, void *,
+		  offset, buffer, length, &pos, &count);
+
+	up(&ipt_mutex);
+
+	*start = (char *)((unsigned long)count - offset);
+	return pos;
+}
+
+static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] =
+{ { "ip_tables_names", ipt_get_tables },
+  { "ip_tables_targets", ipt_get_targets },
+  { "ip_tables_matches", ipt_get_matches },
+  { NULL, NULL} };
+#endif /*CONFIG_PROC_FS*/
+
+static int __init init(void)
+{
+	int ret;
+
+	/* Noone else will be downing sem now, so we won't sleep */
+	down(&ipt_mutex);
+	list_append(&ipt_target, &ipt_standard_target);
+	list_append(&ipt_target, &ipt_error_target);
+	list_append(&ipt_match, &tcp_matchstruct);
+	list_append(&ipt_match, &udp_matchstruct);
+	list_append(&ipt_match, &icmp_matchstruct);
+	up(&ipt_mutex);
+
+	/* Register setsockopt */
+	ret = nf_register_sockopt(&ipt_sockopts);
+	if (ret < 0) {
+		duprintf("Unable to register sockopts.\n");
+		return ret;
+	}
+
+#ifdef CONFIG_PROC_FS
+	{
+	struct proc_dir_entry *proc;
+	int i;
+
+	for (i = 0; ipt_proc_entry[i].name; i++) {
+		proc = proc_net_create(ipt_proc_entry[i].name, 0,
+				       ipt_proc_entry[i].get_info);
+		if (!proc) {
+			while (--i >= 0)
+				proc_net_remove(ipt_proc_entry[i].name);
+			nf_unregister_sockopt(&ipt_sockopts);
+			return -ENOMEM;
+		}
+		proc->owner = THIS_MODULE;
+	}
+	}
+#endif
+
+	printk("ip_tables: (C) 2000-2002 Netfilter core team\n");
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	nf_unregister_sockopt(&ipt_sockopts);
+#ifdef CONFIG_PROC_FS
+	{
+	int i;
+	for (i = 0; ipt_proc_entry[i].name; i++)
+		proc_net_remove(ipt_proc_entry[i].name);
+	}
+#endif
+}
+
+EXPORT_SYMBOL(ipt_register_table);
+EXPORT_SYMBOL(ipt_unregister_table);
+EXPORT_SYMBOL(ipt_register_match);
+EXPORT_SYMBOL(ipt_unregister_match);
+EXPORT_SYMBOL(ipt_do_table);
+EXPORT_SYMBOL(ipt_register_target);
+EXPORT_SYMBOL(ipt_unregister_target);
+EXPORT_SYMBOL(ipt_find_target);
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c
new file mode 100644
index 000000000000..9842e6e23184
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLASSIFY.c
@@ -0,0 +1,92 @@
+/*
+ * This is a module which is used for setting the skb->priority field
+ * of an skb for qdisc classification.
+ */
+
+/* (C) 2001-2002 Patrick McHardy <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLASSIFY.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables qdisc classification target module");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_classify_target_info *clinfo = targinfo;
+
+	if((*pskb)->priority != clinfo->priority) {
+		(*pskb)->priority = clinfo->priority;
+		(*pskb)->nfcache |= NFC_ALTERED;
+	}
+
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+           const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){
+		printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_classify_target_info)));
+		return 0;
+	}
+	
+	if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) |
+	                  (1 << NF_IP_POST_ROUTING))) {
+		printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD "
+		                "and POST_ROUTING.\n");
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_ERR "CLASSIFY: can only be called from "
+		                "\"mangle\" table, not \"%s\".\n",
+		                tablename);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_classify_reg = { 
+	.name 		= "CLASSIFY", 
+	.target 	= target,
+	.checkentry	= checkentry,
+	.me 		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_classify_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_classify_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
new file mode 100644
index 000000000000..0f12e3a3dc73
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -0,0 +1,761 @@
+/* Cluster IP hashmark target 
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ * based on ideas of Fabio Olive Leite <olive@unixforge.org>
+ *
+ * Development of this code funded by SuSE Linux AG, http://www.suse.com/
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/proc_fs.h>
+#include <linux/jhash.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/if_arp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+#include <net/checksum.h>
+
+#include <linux/netfilter_arp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+#define CLUSTERIP_VERSION "0.6"
+
+#define DEBUG_CLUSTERIP
+
+#ifdef DEBUG_CLUSTERIP
+#define DEBUGP	printk
+#else
+#define DEBUGP
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables target for CLUSTERIP");
+
+struct clusterip_config {
+	struct list_head list;			/* list of all configs */
+	atomic_t refcount;			/* reference count */
+
+	u_int32_t clusterip;			/* the IP address */
+	u_int8_t clustermac[ETH_ALEN];		/* the MAC address */
+	struct net_device *dev;			/* device */
+	u_int16_t num_total_nodes;		/* total number of nodes */
+	u_int16_t num_local_nodes;		/* number of local nodes */
+	u_int16_t local_nodes[CLUSTERIP_MAX_NODES];	/* node number array */
+
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *pde;		/* proc dir entry */
+#endif
+	enum clusterip_hashmode hash_mode;	/* which hashing mode */
+	u_int32_t hash_initval;			/* hash initialization */
+};
+
+static LIST_HEAD(clusterip_configs);
+
+/* clusterip_lock protects the clusterip_configs list _AND_ the configurable
+ * data within all structurses (num_local_nodes, local_nodes[]) */
+static DECLARE_RWLOCK(clusterip_lock);
+
+#ifdef CONFIG_PROC_FS
+static struct file_operations clusterip_proc_fops;
+static struct proc_dir_entry *clusterip_procdir;
+#endif
+
+static inline void
+clusterip_config_get(struct clusterip_config *c) {
+	atomic_inc(&c->refcount);
+}
+
+static inline void
+clusterip_config_put(struct clusterip_config *c) {
+	if (atomic_dec_and_test(&c->refcount)) {
+		WRITE_LOCK(&clusterip_lock);
+		list_del(&c->list);
+		WRITE_UNLOCK(&clusterip_lock);
+		dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0);
+		dev_put(c->dev);
+		kfree(c);
+	}
+}
+
+
+static struct clusterip_config *
+__clusterip_config_find(u_int32_t clusterip)
+{
+	struct list_head *pos;
+
+	MUST_BE_READ_LOCKED(&clusterip_lock);
+	list_for_each(pos, &clusterip_configs) {
+		struct clusterip_config *c = list_entry(pos, 
+					struct clusterip_config, list);
+		if (c->clusterip == clusterip) {
+			return c;
+		}
+	}
+
+	return NULL;
+}
+
+static inline struct clusterip_config *
+clusterip_config_find_get(u_int32_t clusterip)
+{
+	struct clusterip_config *c;
+
+	READ_LOCK(&clusterip_lock);
+	c = __clusterip_config_find(clusterip);
+	if (!c) {
+		READ_UNLOCK(&clusterip_lock);
+		return NULL;
+	}
+	atomic_inc(&c->refcount);
+	READ_UNLOCK(&clusterip_lock);
+
+	return c;
+}
+
+static struct clusterip_config *
+clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip,
+			struct net_device *dev)
+{
+	struct clusterip_config *c;
+	char buffer[16];
+
+	c = kmalloc(sizeof(*c), GFP_ATOMIC);
+	if (!c)
+		return NULL;
+
+	memset(c, 0, sizeof(*c));
+	c->dev = dev;
+	c->clusterip = ip;
+	memcpy(&c->clustermac, &i->clustermac, ETH_ALEN);
+	c->num_total_nodes = i->num_total_nodes;
+	c->num_local_nodes = i->num_local_nodes;
+	memcpy(&c->local_nodes, &i->local_nodes, sizeof(&c->local_nodes));
+	c->hash_mode = i->hash_mode;
+	c->hash_initval = i->hash_initval;
+	atomic_set(&c->refcount, 1);
+
+#ifdef CONFIG_PROC_FS
+	/* create proc dir entry */
+	sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip));
+	c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir);
+	if (!c->pde) {
+		kfree(c);
+		return NULL;
+	}
+	c->pde->proc_fops = &clusterip_proc_fops;
+	c->pde->data = c;
+#endif
+
+	WRITE_LOCK(&clusterip_lock);
+	list_add(&c->list, &clusterip_configs);
+	WRITE_UNLOCK(&clusterip_lock);
+
+	return c;
+}
+
+static int
+clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+	int i;
+
+	WRITE_LOCK(&clusterip_lock);
+
+	if (c->num_local_nodes >= CLUSTERIP_MAX_NODES
+	    || nodenum > CLUSTERIP_MAX_NODES) {
+		WRITE_UNLOCK(&clusterip_lock);
+		return 1;
+	}
+
+	/* check if we alrady have this number in our array */
+	for (i = 0; i < c->num_local_nodes; i++) {
+		if (c->local_nodes[i] == nodenum) {
+			WRITE_UNLOCK(&clusterip_lock);
+			return 1;
+		}
+	}
+
+	c->local_nodes[c->num_local_nodes++] = nodenum;
+
+	WRITE_UNLOCK(&clusterip_lock);
+	return 0;
+}
+
+static int
+clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum)
+{
+	int i;
+
+	WRITE_LOCK(&clusterip_lock);
+
+	if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) {
+		WRITE_UNLOCK(&clusterip_lock);
+		return 1;
+	}
+		
+	for (i = 0; i < c->num_local_nodes; i++) {
+		if (c->local_nodes[i] == nodenum) {
+			int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1));
+			memmove(&c->local_nodes[i], &c->local_nodes[i+1], size);
+			c->num_local_nodes--;
+			WRITE_UNLOCK(&clusterip_lock);
+			return 0;
+		}
+	}
+
+	WRITE_UNLOCK(&clusterip_lock);
+	return 1;
+}
+
+static inline u_int32_t
+clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config)
+{
+	struct iphdr *iph = skb->nh.iph;
+	unsigned long hashval;
+	u_int16_t sport, dport;
+	struct tcphdr *th;
+	struct udphdr *uh;
+	struct icmphdr *ih;
+
+	switch (iph->protocol) {
+	case IPPROTO_TCP:
+		th = (void *)iph+iph->ihl*4;
+		sport = ntohs(th->source);
+		dport = ntohs(th->dest);
+		break;
+	case IPPROTO_UDP:
+		uh = (void *)iph+iph->ihl*4;
+		sport = ntohs(uh->source);
+		dport = ntohs(uh->dest);
+		break;
+	case IPPROTO_ICMP:
+		ih = (void *)iph+iph->ihl*4;
+		sport = ntohs(ih->un.echo.id);
+		dport = (ih->type<<8)|ih->code;
+		break;
+	default:
+		if (net_ratelimit()) {
+			printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n",
+				iph->protocol);
+		}
+		sport = dport = 0;
+	}
+
+	switch (config->hash_mode) {
+	case CLUSTERIP_HASHMODE_SIP:
+		hashval = jhash_1word(ntohl(iph->saddr),
+				      config->hash_initval);
+		break;
+	case CLUSTERIP_HASHMODE_SIP_SPT:
+		hashval = jhash_2words(ntohl(iph->saddr), sport, 
+				       config->hash_initval);
+		break;
+	case CLUSTERIP_HASHMODE_SIP_SPT_DPT:
+		hashval = jhash_3words(ntohl(iph->saddr), sport, dport,
+				       config->hash_initval);
+		break;
+	default:
+		/* to make gcc happy */
+		hashval = 0;
+		/* This cannot happen, unless the check function wasn't called
+		 * at rule load time */
+		printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode);
+		BUG();
+		break;
+	}
+
+	/* node numbers are 1..n, not 0..n */
+	return ((hashval % config->num_total_nodes)+1);
+}
+
+static inline int
+clusterip_responsible(struct clusterip_config *config, u_int32_t hash)
+{
+	int i;
+
+	READ_LOCK(&clusterip_lock);
+
+	if (config->num_local_nodes == 0) {
+		READ_UNLOCK(&clusterip_lock);
+		return 0;
+	}
+
+	for (i = 0; i < config->num_local_nodes; i++) {
+		if (config->local_nodes[i] == hash) {
+			READ_UNLOCK(&clusterip_lock);
+			return 1;
+		}
+	}
+
+	READ_UNLOCK(&clusterip_lock);
+
+	return 0;
+}
+
+/*********************************************************************** 
+ * IPTABLES TARGET 
+ ***********************************************************************/
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
+	u_int32_t hash;
+
+	/* don't need to clusterip_config_get() here, since refcount
+	 * is only decremented by destroy() - and ip_tables guarantees
+	 * that the ->target() function isn't called after ->destroy() */
+
+	if (!ct) {
+		printk(KERN_ERR "CLUSTERIP: no conntrack!\n");
+			/* FIXME: need to drop invalid ones, since replies
+			 * to outgoing connections of other nodes will be 
+			 * marked as INVALID */
+		return NF_DROP;
+	}
+
+	/* special case: ICMP error handling. conntrack distinguishes between
+	 * error messages (RELATED) and information requests (see below) */
+	if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP
+	    && (ctinfo == IP_CT_RELATED 
+		|| ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY))
+		return IPT_CONTINUE;
+
+	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, 
+	 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
+	 * on, which all have an ID field [relevant for hashing]. */
+
+	hash = clusterip_hashfn(*pskb, cipinfo->config);
+
+	switch (ctinfo) {
+		case IP_CT_NEW:
+			ct->mark = hash;
+			break;
+		case IP_CT_RELATED:
+		case IP_CT_RELATED+IP_CT_IS_REPLY:
+			/* FIXME: we don't handle expectations at the
+			 * moment.  they can arrive on a different node than
+			 * the master connection (e.g. FTP passive mode) */
+		case IP_CT_ESTABLISHED:
+		case IP_CT_ESTABLISHED+IP_CT_IS_REPLY:
+			break;
+		default:
+			break;
+	}
+
+#ifdef DEBUG_CLUSTERP
+	DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+#endif
+	DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark);
+	if (!clusterip_responsible(cipinfo->config, hash)) {
+		DEBUGP("not responsible\n");
+		return NF_DROP;
+	}
+	DEBUGP("responsible\n");
+
+	/* despite being received via linklayer multicast, this is
+	 * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */
+	(*pskb)->pkt_type = PACKET_HOST;
+
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	struct ipt_clusterip_tgt_info *cipinfo = targinfo;
+
+	struct clusterip_config *config;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) {
+		printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info)));
+		return 0;
+	}
+
+	if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP &&
+	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT &&
+	    cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) {
+		printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n",
+			cipinfo->hash_mode);
+		return 0;
+
+	}
+	if (e->ip.dmsk.s_addr != 0xffffffff
+	    || e->ip.dst.s_addr == 0) {
+		printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
+		return 0;
+	}
+
+	/* FIXME: further sanity checks */
+
+	config = clusterip_config_find_get(e->ip.dst.s_addr);
+	if (!config) {
+		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
+			printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr));
+			return 0;
+		} else {
+			struct net_device *dev;
+
+			if (e->ip.iniface[0] == '\0') {
+				printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n");
+				return 0;
+			}
+
+			dev = dev_get_by_name(e->ip.iniface);
+			if (!dev) {
+				printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface);
+				return 0;
+			}
+
+			config = clusterip_config_init(cipinfo, 
+							e->ip.dst.s_addr, dev);
+			if (!config) {
+				printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n");
+				dev_put(dev);
+				return 0;
+			}
+			dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0);
+		}
+	}
+
+	cipinfo->config = config;
+
+	return 1;
+}
+
+/* drop reference count of cluster config when rule is deleted */
+static void destroy(void *matchinfo, unsigned int matchinfosize)
+{
+	struct ipt_clusterip_tgt_info *cipinfo = matchinfo;
+
+	/* we first remove the proc entry and then drop the reference
+	 * count.  In case anyone still accesses the file, the open/close
+	 * functions are also incrementing the refcount on their own */
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry(cipinfo->config->pde->name,
+			  cipinfo->config->pde->parent);
+#endif
+	clusterip_config_put(cipinfo->config);
+}
+
+static struct ipt_target clusterip_tgt = { 
+	.name = "CLUSTERIP",
+	.target = &target, 
+	.checkentry = &checkentry, 
+	.destroy = &destroy,
+	.me = THIS_MODULE
+};
+
+
+/*********************************************************************** 
+ * ARP MANGLING CODE 
+ ***********************************************************************/
+
+/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */
+struct arp_payload {
+	u_int8_t src_hw[ETH_ALEN];
+	u_int32_t src_ip;
+	u_int8_t dst_hw[ETH_ALEN];
+	u_int32_t dst_ip;
+} __attribute__ ((packed));
+
+#ifdef CLUSTERIP_DEBUG
+static void arp_print(struct arp_payload *payload) 
+{
+#define HBUFFERLEN 30
+	char hbuffer[HBUFFERLEN];
+	int j,k;
+	const char hexbuf[]= "0123456789abcdef";
+
+	for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) {
+		hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15];
+		hbuffer[k++]=hexbuf[payload->src_hw[j]&15];
+		hbuffer[k++]=':';
+	}
+	hbuffer[--k]='\0';
+
+	printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n", 
+		NIPQUAD(payload->src_ip), hbuffer,
+		NIPQUAD(payload->dst_ip));
+}
+#endif
+
+static unsigned int
+arp_mangle(unsigned int hook,
+	   struct sk_buff **pskb,
+	   const struct net_device *in,
+	   const struct net_device *out,
+	   int (*okfn)(struct sk_buff *))
+{
+	struct arphdr *arp = (*pskb)->nh.arph;
+	struct arp_payload *payload;
+	struct clusterip_config *c;
+
+	/* we don't care about non-ethernet and non-ipv4 ARP */
+	if (arp->ar_hrd != htons(ARPHRD_ETHER)
+	    || arp->ar_pro != htons(ETH_P_IP)
+	    || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+		return NF_ACCEPT;
+
+	/* we only want to mangle arp replies */
+	if (arp->ar_op != htons(ARPOP_REPLY))
+		return NF_ACCEPT;
+
+	payload = (void *)(arp+1);
+
+	/* if there is no clusterip configuration for the arp reply's 
+	 * source ip, we don't want to mangle it */
+	c = clusterip_config_find_get(payload->src_ip);
+	if (!c)
+		return NF_ACCEPT;
+
+	/* normally the linux kernel always replies to arp queries of 
+	 * addresses on different interfacs.  However, in the CLUSTERIP case
+	 * this wouldn't work, since we didn't subscribe the mcast group on
+	 * other interfaces */
+	if (c->dev != out) {
+		DEBUGP("CLUSTERIP: not mangling arp reply on different "
+		       "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name);
+		clusterip_config_put(c);
+		return NF_ACCEPT;
+	}
+
+	/* mangle reply hardware address */
+	memcpy(payload->src_hw, c->clustermac, arp->ar_hln);
+
+#ifdef CLUSTERIP_DEBUG
+	DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: ");
+	arp_print(payload);
+#endif
+
+	clusterip_config_put(c);
+
+	return NF_ACCEPT;
+}
+
+static struct nf_hook_ops cip_arp_ops = {
+	.hook = arp_mangle,
+	.pf = NF_ARP,
+	.hooknum = NF_ARP_OUT,
+	.priority = -1
+};
+
+/*********************************************************************** 
+ * PROC DIR HANDLING 
+ ***********************************************************************/
+
+#ifdef CONFIG_PROC_FS
+
+static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct clusterip_config *c = pde->data;
+	unsigned int *nodeidx;
+
+	READ_LOCK(&clusterip_lock);
+	if (*pos >= c->num_local_nodes)
+		return NULL;
+
+	nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL);
+	if (!nodeidx)
+		return ERR_PTR(-ENOMEM);
+
+	*nodeidx = *pos;
+	return nodeidx;
+}
+
+static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct clusterip_config *c = pde->data;
+	unsigned int *nodeidx = (unsigned int *)v;
+
+	*pos = ++(*nodeidx);
+	if (*pos >= c->num_local_nodes) {
+		kfree(v);
+		return NULL;
+	}
+	return nodeidx;
+}
+
+static void clusterip_seq_stop(struct seq_file *s, void *v)
+{
+	kfree(v);
+
+	READ_UNLOCK(&clusterip_lock);
+}
+
+static int clusterip_seq_show(struct seq_file *s, void *v)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct clusterip_config *c = pde->data;
+	unsigned int *nodeidx = (unsigned int *)v;
+
+	if (*nodeidx != 0) 
+		seq_putc(s, ',');
+	seq_printf(s, "%u", c->local_nodes[*nodeidx]);
+
+	if (*nodeidx == c->num_local_nodes-1)
+		seq_putc(s, '\n');
+
+	return 0;
+}
+
+static struct seq_operations clusterip_seq_ops = {
+	.start	= clusterip_seq_start,
+	.next	= clusterip_seq_next,
+	.stop	= clusterip_seq_stop,
+	.show	= clusterip_seq_show,
+};
+
+static int clusterip_proc_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &clusterip_seq_ops);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+		struct proc_dir_entry *pde = PDE(inode);
+		struct clusterip_config *c = pde->data;
+
+		sf->private = pde;
+
+		clusterip_config_get(c);
+	}
+
+	return ret;
+}
+
+static int clusterip_proc_release(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *pde = PDE(inode);
+	struct clusterip_config *c = pde->data;
+	int ret;
+
+	ret = seq_release(inode, file);
+
+	if (!ret)
+		clusterip_config_put(c);
+
+	return ret;
+}
+
+static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
+				size_t size, loff_t *ofs)
+{
+#define PROC_WRITELEN	10
+	char buffer[PROC_WRITELEN+1];
+	struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode);
+	struct clusterip_config *c = pde->data;
+	unsigned long nodenum;
+
+	if (copy_from_user(buffer, input, PROC_WRITELEN))
+		return -EFAULT;
+
+	if (*buffer == '+') {
+		nodenum = simple_strtoul(buffer+1, NULL, 10);
+		if (clusterip_add_node(c, nodenum))
+			return -ENOMEM;
+	} else if (*buffer == '-') {
+		nodenum = simple_strtoul(buffer+1, NULL,10);
+		if (clusterip_del_node(c, nodenum))
+			return -ENOENT;
+	} else
+		return -EIO;
+
+	return size;
+}
+
+static struct file_operations clusterip_proc_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = clusterip_proc_open,
+	.read	 = seq_read,
+	.write	 = clusterip_proc_write,
+	.llseek	 = seq_lseek,
+	.release = clusterip_proc_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+
+static int init_or_cleanup(int fini)
+{
+	int ret;
+
+	if (fini)
+		goto cleanup;
+
+	if (ipt_register_target(&clusterip_tgt)) {
+		ret = -EINVAL;
+		goto cleanup_none;
+	}
+
+	if (nf_register_hook(&cip_arp_ops) < 0) {
+		ret = -EINVAL;
+		goto cleanup_target;
+	}
+
+#ifdef CONFIG_PROC_FS
+	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net);
+	if (!clusterip_procdir) {
+		printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n");
+		ret = -ENOMEM;
+		goto cleanup_hook;
+	}
+#endif /* CONFIG_PROC_FS */
+
+	printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n",
+		CLUSTERIP_VERSION);
+
+	return 0;
+
+cleanup:
+	printk(KERN_NOTICE "ClusterIP Version %s unloading\n",
+		CLUSTERIP_VERSION);
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
+#endif
+cleanup_hook:
+	nf_unregister_hook(&cip_arp_ops);
+cleanup_target:
+	ipt_unregister_target(&clusterip_tgt);
+cleanup_none:
+	return -EINVAL;
+}
+
+static int __init init(void)
+{
+	return init_or_cleanup(0);
+}
+
+static void __exit fini(void)
+{
+	init_or_cleanup(1);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c
new file mode 100644
index 000000000000..30ddd3e18eb7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_CONNMARK.c
@@ -0,0 +1,118 @@
+/* This kernel module is used to modify the connection mark values, or
+ * to optionally restore the skb nfmark from the connection mark
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
+MODULE_DESCRIPTION("IP tables CONNMARK matching module");
+MODULE_LICENSE("GPL");
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_CONNMARK.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_connmark_target_info *markinfo = targinfo;
+	unsigned long diff;
+	unsigned long nfmark;
+	unsigned long newmark;
+
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo);
+	if (ct) {
+	    switch(markinfo->mode) {
+	    case IPT_CONNMARK_SET:
+		newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
+		if (newmark != ct->mark)
+		    ct->mark = newmark;
+		break;
+	    case IPT_CONNMARK_SAVE:
+		newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask);
+		if (ct->mark != newmark)
+		    ct->mark = newmark;
+		break;
+	    case IPT_CONNMARK_RESTORE:
+		nfmark = (*pskb)->nfmark;
+		diff = (ct->mark ^ nfmark) & markinfo->mask;
+		if (diff != 0) {
+		    (*pskb)->nfmark = nfmark ^ diff;
+		    (*pskb)->nfcache |= NFC_ALTERED;
+		}
+		break;
+	    }
+	}
+
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+	   void *targinfo,
+	   unsigned int targinfosize,
+	   unsigned int hook_mask)
+{
+	struct ipt_connmark_target_info *matchinfo = targinfo;
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) {
+		printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_connmark_target_info)));
+		return 0;
+	}
+
+	if (matchinfo->mode == IPT_CONNMARK_RESTORE) {
+	    if (strcmp(tablename, "mangle") != 0) {
+		    printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		    return 0;
+	    }
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_connmark_reg = {
+	.name = "CONNMARK",
+	.target = &target,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_connmark_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_connmark_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c
new file mode 100644
index 000000000000..3ea4509099f9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_DSCP.c
@@ -0,0 +1,106 @@
+/* iptables module for setting the IPv4 DSCP field, Version 1.8
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh <mgm@paktronix.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as 
+ * published by the Free Software Foundation.
+ * 
+ * See RFC2474 for a description of the DSCP field within the IP Header.
+ *
+ * ipt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp
+*/
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_DSCP.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables DSCP modification module");
+MODULE_LICENSE("GPL");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_DSCP_info *dinfo = targinfo;
+	u_int8_t sh_dscp = ((dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
+
+
+	if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) {
+		u_int16_t diffs[2];
+
+		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+			return NF_DROP;
+
+		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
+		(*pskb)->nh.iph->tos = ((*pskb)->nh.iph->tos & ~IPT_DSCP_MASK)
+			| sh_dscp;
+		diffs[1] = htons((*pskb)->nh.iph->tos);
+		(*pskb)->nh.iph->check
+			= csum_fold(csum_partial((char *)diffs,
+						 sizeof(diffs),
+						 (*pskb)->nh.iph->check
+						 ^ 0xFFFF));
+		(*pskb)->nfcache |= NFC_ALTERED;
+	}
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	const u_int8_t dscp = ((struct ipt_DSCP_info *)targinfo)->dscp;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_DSCP_info))) {
+		printk(KERN_WARNING "DSCP: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_DSCP_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "DSCP: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if ((dscp > IPT_DSCP_MAX)) {
+		printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_dscp_reg = {
+	.name		= "DSCP",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_dscp_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_dscp_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
new file mode 100644
index 000000000000..ada9911118e9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -0,0 +1,175 @@
+/* iptables module for the IPv4 and TCP ECN bits, Version 1.5
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as 
+ * published by the Free Software Foundation.
+ *
+ * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp
+*/
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ECN.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables ECN modification module");
+
+/* set ECT codepoint from IP header.
+ * 	return 0 if there was an error. */
+static inline int
+set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo)
+{
+	if (((*pskb)->nh.iph->tos & IPT_ECN_IP_MASK)
+	    != (einfo->ip_ect & IPT_ECN_IP_MASK)) {
+		u_int16_t diffs[2];
+
+		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+			return 0;
+
+		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
+		(*pskb)->nh.iph->tos &= ~IPT_ECN_IP_MASK;
+		(*pskb)->nh.iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK);
+		diffs[1] = htons((*pskb)->nh.iph->tos);
+		(*pskb)->nh.iph->check
+			= csum_fold(csum_partial((char *)diffs,
+						 sizeof(diffs),
+						 (*pskb)->nh.iph->check
+						 ^0xFFFF));
+		(*pskb)->nfcache |= NFC_ALTERED;
+	} 
+	return 1;
+}
+
+/* Return 0 if there was an error. */
+static inline int
+set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward)
+{
+	struct tcphdr _tcph, *tcph;
+	u_int16_t diffs[2];
+
+	/* Not enought header? */
+	tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4,
+				  sizeof(_tcph), &_tcph);
+	if (!tcph)
+		return 0;
+
+	if (!(einfo->operation & IPT_ECN_OP_SET_ECE
+	      || tcph->ece == einfo->proto.tcp.ece)
+	    && (!(einfo->operation & IPT_ECN_OP_SET_CWR
+		  || tcph->cwr == einfo->proto.tcp.cwr)))
+		return 1;
+
+	if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph)))
+		return 0;
+	tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4;
+
+	diffs[0] = ((u_int16_t *)tcph)[6];
+	if (einfo->operation & IPT_ECN_OP_SET_ECE)
+		tcph->ece = einfo->proto.tcp.ece;
+	if (einfo->operation & IPT_ECN_OP_SET_CWR)
+		tcph->cwr = einfo->proto.tcp.cwr;
+	diffs[1] = ((u_int16_t *)tcph)[6];
+	diffs[0] = diffs[0] ^ 0xFFFF;
+
+	if ((*pskb)->ip_summed != CHECKSUM_HW)
+		tcph->check = csum_fold(csum_partial((char *)diffs,
+						     sizeof(diffs),
+						     tcph->check^0xFFFF));
+	else
+		if (skb_checksum_help(*pskb, inward))
+			return 0;
+	(*pskb)->nfcache |= NFC_ALTERED;
+	return 1;
+}
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_ECN_info *einfo = targinfo;
+
+	if (einfo->operation & IPT_ECN_OP_SET_IP)
+		if (!set_ect_ip(pskb, einfo))
+			return NF_DROP;
+
+	if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
+	    && (*pskb)->nh.iph->protocol == IPPROTO_TCP)
+		if (!set_ect_tcp(pskb, einfo, (out == NULL)))
+			return NF_DROP;
+
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) {
+		printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_ECN_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "ECN: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if (einfo->operation & IPT_ECN_OP_MASK) {
+		printk(KERN_WARNING "ECN: unsupported ECN operation %x\n",
+			einfo->operation);
+		return 0;
+	}
+	if (einfo->ip_ect & ~IPT_ECN_IP_MASK) {
+		printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n",
+			einfo->ip_ect);
+		return 0;
+	}
+
+	if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR))
+	    && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & IPT_INV_PROTO))) {
+		printk(KERN_WARNING "ECN: cannot use TCP operations on a "
+		       "non-tcp rule\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_ecn_reg = {
+	.name		= "ECN",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_ecn_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_ecn_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
new file mode 100644
index 000000000000..ef08733d26da
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -0,0 +1,485 @@
+/*
+ * This is a module which is used for logging packets.
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/tcp.h>
+#include <net/route.h>
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_LOG.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables syslog logging module");
+
+static unsigned int nflog = 1;
+module_param(nflog, int, 0400);
+MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
+ 
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Use lock to serialize, so printks don't overlap */
+static DEFINE_SPINLOCK(log_lock);
+
+/* One level of recursion won't kill us */
+static void dump_packet(const struct ipt_log_info *info,
+			const struct sk_buff *skb,
+			unsigned int iphoff)
+{
+	struct iphdr _iph, *ih;
+
+	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
+	if (ih == NULL) {
+		printk("TRUNCATED");
+		return;
+	}
+
+	/* Important fields:
+	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
+	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
+	printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ",
+	       NIPQUAD(ih->saddr), NIPQUAD(ih->daddr));
+
+	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
+	printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
+	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
+
+	/* Max length: 6 "CE DF MF " */
+	if (ntohs(ih->frag_off) & IP_CE)
+		printk("CE ");
+	if (ntohs(ih->frag_off) & IP_DF)
+		printk("DF ");
+	if (ntohs(ih->frag_off) & IP_MF)
+		printk("MF ");
+
+	/* Max length: 11 "FRAG:65535 " */
+	if (ntohs(ih->frag_off) & IP_OFFSET)
+		printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+
+	if ((info->logflags & IPT_LOG_IPOPT)
+	    && ih->ihl * 4 > sizeof(struct iphdr)) {
+		unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op;
+		unsigned int i, optsize;
+
+		optsize = ih->ihl * 4 - sizeof(struct iphdr);
+		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
+					optsize, _opt);
+		if (op == NULL) {
+			printk("TRUNCATED");
+			return;
+		}
+
+		/* Max length: 127 "OPT (" 15*4*2chars ") " */
+		printk("OPT (");
+		for (i = 0; i < optsize; i++)
+			printk("%02X", op[i]);
+		printk(") ");
+	}
+
+	switch (ih->protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr _tcph, *th;
+
+		/* Max length: 10 "PROTO=TCP " */
+		printk("PROTO=TCP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+					sizeof(_tcph), &_tcph);
+		if (th == NULL) {
+			printk("INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Max length: 20 "SPT=65535 DPT=65535 " */
+		printk("SPT=%u DPT=%u ",
+		       ntohs(th->source), ntohs(th->dest));
+		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
+		if (info->logflags & IPT_LOG_TCPSEQ)
+			printk("SEQ=%u ACK=%u ",
+			       ntohl(th->seq), ntohl(th->ack_seq));
+		/* Max length: 13 "WINDOW=65535 " */
+		printk("WINDOW=%u ", ntohs(th->window));
+		/* Max length: 9 "RES=0x3F " */
+		printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
+		if (th->cwr)
+			printk("CWR ");
+		if (th->ece)
+			printk("ECE ");
+		if (th->urg)
+			printk("URG ");
+		if (th->ack)
+			printk("ACK ");
+		if (th->psh)
+			printk("PSH ");
+		if (th->rst)
+			printk("RST ");
+		if (th->syn)
+			printk("SYN ");
+		if (th->fin)
+			printk("FIN ");
+		/* Max length: 11 "URGP=65535 " */
+		printk("URGP=%u ", ntohs(th->urg_ptr));
+
+		if ((info->logflags & IPT_LOG_TCPOPT)
+		    && th->doff * 4 > sizeof(struct tcphdr)) {
+			unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
+			unsigned char *op;
+			unsigned int i, optsize;
+
+			optsize = th->doff * 4 - sizeof(struct tcphdr);
+			op = skb_header_pointer(skb,
+						iphoff+ih->ihl*4+sizeof(_tcph),
+						optsize, _opt);
+			if (op == NULL) {
+				printk("TRUNCATED");
+				return;
+			}
+
+			/* Max length: 127 "OPT (" 15*4*2chars ") " */
+			printk("OPT (");
+			for (i = 0; i < optsize; i++)
+				printk("%02X", op[i]);
+			printk(") ");
+		}
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr _udph, *uh;
+
+		/* Max length: 10 "PROTO=UDP " */
+		printk("PROTO=UDP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+					sizeof(_udph), &_udph);
+		if (uh == NULL) {
+			printk("INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Max length: 20 "SPT=65535 DPT=65535 " */
+		printk("SPT=%u DPT=%u LEN=%u ",
+		       ntohs(uh->source), ntohs(uh->dest),
+		       ntohs(uh->len));
+		break;
+	}
+	case IPPROTO_ICMP: {
+		struct icmphdr _icmph, *ich;
+		static size_t required_len[NR_ICMP_TYPES+1]
+			= { [ICMP_ECHOREPLY] = 4,
+			    [ICMP_DEST_UNREACH]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_SOURCE_QUENCH]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_REDIRECT]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_ECHO] = 4,
+			    [ICMP_TIME_EXCEEDED]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_PARAMETERPROB]
+			    = 8 + sizeof(struct iphdr),
+			    [ICMP_TIMESTAMP] = 20,
+			    [ICMP_TIMESTAMPREPLY] = 20,
+			    [ICMP_ADDRESS] = 12,
+			    [ICMP_ADDRESSREPLY] = 12 };
+
+		/* Max length: 11 "PROTO=ICMP " */
+		printk("PROTO=ICMP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
+					 sizeof(_icmph), &_icmph);
+		if (ich == NULL) {
+			printk("INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Max length: 18 "TYPE=255 CODE=255 " */
+		printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		if (ich->type <= NR_ICMP_TYPES
+		    && required_len[ich->type]
+		    && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+			printk("INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		switch (ich->type) {
+		case ICMP_ECHOREPLY:
+		case ICMP_ECHO:
+			/* Max length: 19 "ID=65535 SEQ=65535 " */
+			printk("ID=%u SEQ=%u ",
+			       ntohs(ich->un.echo.id),
+			       ntohs(ich->un.echo.sequence));
+			break;
+
+		case ICMP_PARAMETERPROB:
+			/* Max length: 14 "PARAMETER=255 " */
+			printk("PARAMETER=%u ",
+			       ntohl(ich->un.gateway) >> 24);
+			break;
+		case ICMP_REDIRECT:
+			/* Max length: 24 "GATEWAY=255.255.255.255 " */
+			printk("GATEWAY=%u.%u.%u.%u ",
+			       NIPQUAD(ich->un.gateway));
+			/* Fall through */
+		case ICMP_DEST_UNREACH:
+		case ICMP_SOURCE_QUENCH:
+		case ICMP_TIME_EXCEEDED:
+			/* Max length: 3+maxlen */
+			if (!iphoff) { /* Only recurse once. */
+				printk("[");
+				dump_packet(info, skb,
+					    iphoff + ih->ihl*4+sizeof(_icmph));
+				printk("] ");
+			}
+
+			/* Max length: 10 "MTU=65535 " */
+			if (ich->type == ICMP_DEST_UNREACH
+			    && ich->code == ICMP_FRAG_NEEDED)
+				printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+		}
+		break;
+	}
+	/* Max Length */
+	case IPPROTO_AH: {
+		struct ip_auth_hdr _ahdr, *ah;
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+		
+		/* Max length: 9 "PROTO=AH " */
+		printk("PROTO=AH ");
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
+					sizeof(_ahdr), &_ahdr);
+		if (ah == NULL) {
+			printk("INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Length: 15 "SPI=0xF1234567 " */
+		printk("SPI=0x%x ", ntohl(ah->spi));
+		break;
+	}
+	case IPPROTO_ESP: {
+		struct ip_esp_hdr _esph, *eh;
+
+		/* Max length: 10 "PROTO=ESP " */
+		printk("PROTO=ESP ");
+
+		if (ntohs(ih->frag_off) & IP_OFFSET)
+			break;
+
+		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
+		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
+					sizeof(_esph), &_esph);
+		if (eh == NULL) {
+			printk("INCOMPLETE [%u bytes] ",
+			       skb->len - iphoff - ih->ihl*4);
+			break;
+		}
+
+		/* Length: 15 "SPI=0xF1234567 " */
+		printk("SPI=0x%x ", ntohl(eh->spi));
+		break;
+	}
+	/* Max length: 10 "PROTO 255 " */
+	default:
+		printk("PROTO=%u ", ih->protocol);
+	}
+
+	/* Max length: 15 "UID=4294967295 " */
+ 	if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
+		read_lock_bh(&skb->sk->sk_callback_lock);
+		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
+ 			printk("UID=%u ", skb->sk->sk_socket->file->f_uid);
+		read_unlock_bh(&skb->sk->sk_callback_lock);
+	}
+
+	/* Proto    Max log string length */
+	/* IP:      40+46+6+11+127 = 230 */
+	/* TCP:     10+max(25,20+30+13+9+32+11+127) = 252 */
+	/* UDP:     10+max(25,20) = 35 */
+	/* ICMP:    11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */
+	/* ESP:     10+max(25)+15 = 50 */
+	/* AH:      9+max(25)+15 = 49 */
+	/* unknown: 10 */
+
+	/* (ICMP allows recursion one level deep) */
+	/* maxlen =  IP + ICMP +  IP + max(TCP,UDP,ICMP,unknown) */
+	/* maxlen = 230+   91  + 230 + 252 = 803 */
+}
+
+static void
+ipt_log_packet(unsigned int hooknum,
+	       const struct sk_buff *skb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       const struct ipt_log_info *loginfo,
+	       const char *level_string,
+	       const char *prefix)
+{
+	spin_lock_bh(&log_lock);
+	printk(level_string);
+	printk("%sIN=%s OUT=%s ",
+	       prefix == NULL ? loginfo->prefix : prefix,
+	       in ? in->name : "",
+	       out ? out->name : "");
+#ifdef CONFIG_BRIDGE_NETFILTER
+	if (skb->nf_bridge) {
+		struct net_device *physindev = skb->nf_bridge->physindev;
+		struct net_device *physoutdev = skb->nf_bridge->physoutdev;
+
+		if (physindev && in != physindev)
+			printk("PHYSIN=%s ", physindev->name);
+		if (physoutdev && out != physoutdev)
+			printk("PHYSOUT=%s ", physoutdev->name);
+	}
+#endif
+
+	if (in && !out) {
+		/* MAC logging for input chain only. */
+		printk("MAC=");
+		if (skb->dev && skb->dev->hard_header_len
+		    && skb->mac.raw != (void*)skb->nh.iph) {
+			int i;
+			unsigned char *p = skb->mac.raw;
+			for (i = 0; i < skb->dev->hard_header_len; i++,p++)
+				printk("%02x%c", *p,
+				       i==skb->dev->hard_header_len - 1
+				       ? ' ':':');
+		} else
+			printk(" ");
+	}
+
+	dump_packet(loginfo, skb, 0);
+	printk("\n");
+	spin_unlock_bh(&log_lock);
+}
+
+static unsigned int
+ipt_log_target(struct sk_buff **pskb,
+	       const struct net_device *in,
+	       const struct net_device *out,
+	       unsigned int hooknum,
+	       const void *targinfo,
+	       void *userinfo)
+{
+	const struct ipt_log_info *loginfo = targinfo;
+	char level_string[4] = "< >";
+
+	level_string[1] = '0' + (loginfo->level % 8);
+	ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL);
+
+	return IPT_CONTINUE;
+}
+
+static void
+ipt_logfn(unsigned int hooknum,
+	  const struct sk_buff *skb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  const char *prefix)
+{
+	struct ipt_log_info loginfo = { 
+		.level = 0, 
+		.logflags = IPT_LOG_MASK, 
+		.prefix = "" 
+	};
+
+	ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix);
+}
+
+static int ipt_log_checkentry(const char *tablename,
+			      const struct ipt_entry *e,
+			      void *targinfo,
+			      unsigned int targinfosize,
+			      unsigned int hook_mask)
+{
+	const struct ipt_log_info *loginfo = targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) {
+		DEBUGP("LOG: targinfosize %u != %u\n",
+		       targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info)));
+		return 0;
+	}
+
+	if (loginfo->level >= 8) {
+		DEBUGP("LOG: level %u >= 8\n", loginfo->level);
+		return 0;
+	}
+
+	if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') {
+		DEBUGP("LOG: prefix term %i\n",
+		       loginfo->prefix[sizeof(loginfo->prefix)-1]);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_log_reg = {
+	.name		= "LOG",
+	.target		= ipt_log_target,
+	.checkentry	= ipt_log_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	if (ipt_register_target(&ipt_log_reg))
+		return -EINVAL;
+	if (nflog)
+		nf_log_register(PF_INET, &ipt_logfn);
+	
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	if (nflog)
+		nf_log_unregister(PF_INET, &ipt_logfn);
+	ipt_unregister_target(&ipt_log_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c
new file mode 100644
index 000000000000..33c6f9b63b8d
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MARK.c
@@ -0,0 +1,162 @@
+/* This is a module which is used for setting the NFMARK field of an skb. */
+
+/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_MARK.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables MARK modification module");
+
+static unsigned int
+target_v0(struct sk_buff **pskb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  unsigned int hooknum,
+	  const void *targinfo,
+	  void *userinfo)
+{
+	const struct ipt_mark_target_info *markinfo = targinfo;
+
+	if((*pskb)->nfmark != markinfo->mark) {
+		(*pskb)->nfmark = markinfo->mark;
+		(*pskb)->nfcache |= NFC_ALTERED;
+	}
+	return IPT_CONTINUE;
+}
+
+static unsigned int
+target_v1(struct sk_buff **pskb,
+	  const struct net_device *in,
+	  const struct net_device *out,
+	  unsigned int hooknum,
+	  const void *targinfo,
+	  void *userinfo)
+{
+	const struct ipt_mark_target_info_v1 *markinfo = targinfo;
+	int mark = 0;
+
+	switch (markinfo->mode) {
+	case IPT_MARK_SET:
+		mark = markinfo->mark;
+		break;
+		
+	case IPT_MARK_AND:
+		mark = (*pskb)->nfmark & markinfo->mark;
+		break;
+		
+	case IPT_MARK_OR:
+		mark = (*pskb)->nfmark | markinfo->mark;
+		break;
+	}
+
+	if((*pskb)->nfmark != mark) {
+		(*pskb)->nfmark = mark;
+		(*pskb)->nfcache |= NFC_ALTERED;
+	}
+	return IPT_CONTINUE;
+}
+
+
+static int
+checkentry_v0(const char *tablename,
+	      const struct ipt_entry *e,
+	      void *targinfo,
+	      unsigned int targinfosize,
+	      unsigned int hook_mask)
+{
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) {
+		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_mark_target_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	return 1;
+}
+
+static int
+checkentry_v1(const char *tablename,
+	      const struct ipt_entry *e,
+	      void *targinfo,
+	      unsigned int targinfosize,
+	      unsigned int hook_mask)
+{
+	struct ipt_mark_target_info_v1 *markinfo = targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){
+		printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if (markinfo->mode != IPT_MARK_SET
+	    && markinfo->mode != IPT_MARK_AND
+	    && markinfo->mode != IPT_MARK_OR) {
+		printk(KERN_WARNING "MARK: unknown mode %u\n",
+		       markinfo->mode);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_mark_reg_v0 = {
+	.name		= "MARK",
+	.target		= target_v0,
+	.checkentry	= checkentry_v0,
+	.me		= THIS_MODULE,
+	.revision	= 0,
+};
+
+static struct ipt_target ipt_mark_reg_v1 = {
+	.name		= "MARK",
+	.target		= target_v1,
+	.checkentry	= checkentry_v1,
+	.me		= THIS_MODULE,
+	.revision	= 1,
+};
+
+static int __init init(void)
+{
+	int err;
+
+	err = ipt_register_target(&ipt_mark_reg_v0);
+	if (!err) {
+		err = ipt_register_target(&ipt_mark_reg_v1);
+		if (err)
+			ipt_unregister_target(&ipt_mark_reg_v0);
+	}
+	return err;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_mark_reg_v0);
+	ipt_unregister_target(&ipt_mark_reg_v1);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
new file mode 100644
index 000000000000..57e9f6cf1c36
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -0,0 +1,207 @@
+/* Masquerade.  Simple mapping which alters range to a local IP address
+   (depending on route). */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables MASQUERADE target module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* Lock protects masq region inside conntrack */
+static DECLARE_RWLOCK(masq_lock);
+
+/* FIXME: Multiple targets. --RR */
+static int
+masquerade_check(const char *tablename,
+		 const struct ipt_entry *e,
+		 void *targinfo,
+		 unsigned int targinfosize,
+		 unsigned int hook_mask)
+{
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+
+	if (strcmp(tablename, "nat") != 0) {
+		DEBUGP("masquerade_check: bad table `%s'.\n", tablename);
+		return 0;
+	}
+	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+		DEBUGP("masquerade_check: size %u != %u.\n",
+		       targinfosize, sizeof(*mr));
+		return 0;
+	}
+	if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) {
+		DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask);
+		return 0;
+	}
+	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+		DEBUGP("masquerade_check: bad MAP_IPS.\n");
+		return 0;
+	}
+	if (mr->rangesize != 1) {
+		DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize);
+		return 0;
+	}
+	return 1;
+}
+
+static unsigned int
+masquerade_target(struct sk_buff **pskb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  unsigned int hooknum,
+		  const void *targinfo,
+		  void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	const struct ip_nat_multi_range_compat *mr;
+	struct ip_nat_range newrange;
+	struct rtable *rt;
+	u_int32_t newsrc;
+
+	IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING);
+
+	/* FIXME: For the moment, don't do local packets, breaks
+	   testsuite for 2.3.49 --RR */
+	if ((*pskb)->sk)
+		return NF_ACCEPT;
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+	IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
+	                    || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+
+	mr = targinfo;
+	rt = (struct rtable *)(*pskb)->dst;
+	newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE);
+	if (!newsrc) {
+		printk("MASQUERADE: %s ate my IP address\n", out->name);
+		return NF_DROP;
+	}
+
+	WRITE_LOCK(&masq_lock);
+	ct->nat.masq_index = out->ifindex;
+	WRITE_UNLOCK(&masq_lock);
+
+	/* Transfer from original range. */
+	newrange = ((struct ip_nat_range)
+		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+		  newsrc, newsrc,
+		  mr->range[0].min, mr->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static inline int
+device_cmp(struct ip_conntrack *i, void *ifindex)
+{
+	int ret;
+
+	READ_LOCK(&masq_lock);
+	ret = (i->nat.masq_index == (int)(long)ifindex);
+	READ_UNLOCK(&masq_lock);
+
+	return ret;
+}
+
+static int masq_device_event(struct notifier_block *this,
+			     unsigned long event,
+			     void *ptr)
+{
+	struct net_device *dev = ptr;
+
+	if (event == NETDEV_DOWN) {
+		/* Device was downed.  Search entire table for
+		   conntracks which were associated with that device,
+		   and forget them. */
+		IP_NF_ASSERT(dev->ifindex != 0);
+
+		ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static int masq_inet_event(struct notifier_block *this,
+			   unsigned long event,
+			   void *ptr)
+{
+	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+
+	if (event == NETDEV_DOWN) {
+		/* IP address was deleted.  Search entire table for
+		   conntracks which were associated with that device,
+		   and forget them. */
+		IP_NF_ASSERT(dev->ifindex != 0);
+
+		ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex);
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block masq_dev_notifier = {
+	.notifier_call	= masq_device_event,
+};
+
+static struct notifier_block masq_inet_notifier = {
+	.notifier_call	= masq_inet_event,
+};
+
+static struct ipt_target masquerade = {
+	.name		= "MASQUERADE",
+	.target		= masquerade_target,
+	.checkentry	= masquerade_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	ret = ipt_register_target(&masquerade);
+
+	if (ret == 0) {
+		/* Register for device down reports */
+		register_netdevice_notifier(&masq_dev_notifier);
+		/* Register IP address change reports */
+		register_inetaddr_notifier(&masq_inet_notifier);
+	}
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&masquerade);
+	unregister_netdevice_notifier(&masq_dev_notifier);
+	unregister_inetaddr_notifier(&masq_inet_notifier);	
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c
new file mode 100644
index 000000000000..06254b29d034
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NETMAP.c
@@ -0,0 +1,117 @@
+/* NETMAP - static NAT mapping of IP network addresses (1:1).
+ * The mapping can be applied to source (POSTROUTING),
+ * destination (PREROUTING), or both (with separate rules).
+ */
+
+/* (C) 2000-2001 Svenning Soerensen <svenning@post5.tele.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/ip.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+
+#define MODULENAME "NETMAP"
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Svenning Soerensen <svenning@post5.tele.dk>");
+MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int
+check(const char *tablename,
+      const struct ipt_entry *e,
+      void *targinfo,
+      unsigned int targinfosize,
+      unsigned int hook_mask)
+{
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+
+	if (strcmp(tablename, "nat") != 0) {
+		DEBUGP(MODULENAME":check: bad table `%s'.\n", tablename);
+		return 0;
+	}
+	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+		DEBUGP(MODULENAME":check: size %u.\n", targinfosize);
+		return 0;
+	}
+	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) {
+		DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask);
+		return 0;
+	}
+	if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) {
+		DEBUGP(MODULENAME":check: bad MAP_IPS.\n");
+		return 0;
+	}
+	if (mr->rangesize != 1) {
+		DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize);
+		return 0;
+	}
+	return 1;
+}
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t new_ip, netmask;
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+	struct ip_nat_range newrange;
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+		     || hooknum == NF_IP_POST_ROUTING);
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip);
+
+	if (hooknum == NF_IP_PRE_ROUTING)
+		new_ip = (*pskb)->nh.iph->daddr & ~netmask;
+	else
+		new_ip = (*pskb)->nh.iph->saddr & ~netmask;
+	new_ip |= mr->range[0].min_ip & netmask;
+
+	newrange = ((struct ip_nat_range)
+		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+		  new_ip, new_ip,
+		  mr->range[0].min, mr->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target target_module = { 
+	.name 		= MODULENAME,
+	.target 	= target, 
+	.checkentry 	= check,
+    	.me 		= THIS_MODULE 
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&target_module);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&target_module);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c
new file mode 100644
index 000000000000..a4bb9b3bc292
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_NOTRACK.c
@@ -0,0 +1,76 @@
+/* This is a module which is used for setting up fake conntracks
+ * on packets so that they are not seen by the conntrack/NAT code.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	/* Previously seen (loopback)? Ignore. */
+	if ((*pskb)->nfct != NULL)
+		return IPT_CONTINUE;
+
+	/* Attach fake conntrack entry. 
+	   If there is a real ct entry correspondig to this packet, 
+	   it'll hang aroun till timing out. We don't deal with it
+	   for performance reasons. JK */
+	(*pskb)->nfct = &ip_conntrack_untracked.ct_general;
+	(*pskb)->nfctinfo = IP_CT_NEW;
+	nf_conntrack_get((*pskb)->nfct);
+
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	if (targinfosize != 0) {
+		printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n",
+		       targinfosize);
+		return 0;
+	}
+
+	if (strcmp(tablename, "raw") != 0) {
+		printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_notrack_reg = { 
+	.name = "NOTRACK", 
+	.target = target, 
+	.checkentry = checkentry,
+	.me = THIS_MODULE 
+};
+
+static int __init init(void)
+{
+	if (ipt_register_target(&ipt_notrack_reg))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_notrack_reg);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
new file mode 100644
index 000000000000..d2e13447678e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -0,0 +1,129 @@
+/* Redirect.  Simple mapping which alters dst to a local IP address. */
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables REDIRECT target module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/* FIXME: Take multiple ranges --RR */
+static int
+redirect_check(const char *tablename,
+	       const struct ipt_entry *e,
+	       void *targinfo,
+	       unsigned int targinfosize,
+	       unsigned int hook_mask)
+{
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+
+	if (strcmp(tablename, "nat") != 0) {
+		DEBUGP("redirect_check: bad table `%s'.\n", table);
+		return 0;
+	}
+	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+		DEBUGP("redirect_check: size %u.\n", targinfosize);
+		return 0;
+	}
+	if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) {
+		DEBUGP("redirect_check: bad hooks %x.\n", hook_mask);
+		return 0;
+	}
+	if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) {
+		DEBUGP("redirect_check: bad MAP_IPS.\n");
+		return 0;
+	}
+	if (mr->rangesize != 1) {
+		DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize);
+		return 0;
+	}
+	return 1;
+}
+
+static unsigned int
+redirect_target(struct sk_buff **pskb,
+		const struct net_device *in,
+		const struct net_device *out,
+		unsigned int hooknum,
+		const void *targinfo,
+		void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t newdst;
+	const struct ip_nat_multi_range_compat *mr = targinfo;
+	struct ip_nat_range newrange;
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING
+		     || hooknum == NF_IP_LOCAL_OUT);
+
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+	IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
+
+	/* Local packets: make them go to loopback */
+	if (hooknum == NF_IP_LOCAL_OUT)
+		newdst = htonl(0x7F000001);
+	else {
+		struct in_device *indev;
+
+		/* Device might not have an associated in_device. */
+		indev = (struct in_device *)(*pskb)->dev->ip_ptr;
+		if (indev == NULL || indev->ifa_list == NULL)
+			return NF_DROP;
+
+		/* Grab first address on interface. */
+		newdst = indev->ifa_list->ifa_local;
+	}
+
+	/* Transfer from original range. */
+	newrange = ((struct ip_nat_range)
+		{ mr->range[0].flags | IP_NAT_RANGE_MAP_IPS,
+		  newdst, newdst,
+		  mr->range[0].min, mr->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target redirect_reg = {
+	.name		= "REDIRECT",
+	.target		= redirect_target,
+	.checkentry	= redirect_check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&redirect_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&redirect_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
new file mode 100644
index 000000000000..266d64979286
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -0,0 +1,335 @@
+/*
+ * This is a module which is used for rejecting packets.
+ * Added support for customized reject packets (Jozsef Kadlecsik).
+ * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812]
+ */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/tcp.h>
+#include <net/route.h>
+#include <net/dst.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_REJECT.h>
+#ifdef CONFIG_BRIDGE_NETFILTER
+#include <linux/netfilter_bridge.h>
+#endif
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables REJECT target module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static inline struct rtable *route_reverse(struct sk_buff *skb, 
+					   struct tcphdr *tcph, int hook)
+{
+	struct iphdr *iph = skb->nh.iph;
+	struct dst_entry *odst;
+	struct flowi fl = {};
+	struct rtable *rt;
+
+	/* We don't require ip forwarding to be enabled to be able to
+	 * send a RST reply for bridged traffic. */
+	if (hook != NF_IP_FORWARD
+#ifdef CONFIG_BRIDGE_NETFILTER
+	    || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED)
+#endif
+	   ) {
+		fl.nl_u.ip4_u.daddr = iph->saddr;
+		if (hook == NF_IP_LOCAL_IN)
+			fl.nl_u.ip4_u.saddr = iph->daddr;
+		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+
+		if (ip_route_output_key(&rt, &fl) != 0)
+			return NULL;
+	} else {
+		/* non-local src, find valid iif to satisfy
+		 * rp-filter when calling ip_route_input. */
+		fl.nl_u.ip4_u.daddr = iph->daddr;
+		if (ip_route_output_key(&rt, &fl) != 0)
+			return NULL;
+
+		odst = skb->dst;
+		if (ip_route_input(skb, iph->saddr, iph->daddr,
+		                   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
+			dst_release(&rt->u.dst);
+			return NULL;
+		}
+		dst_release(&rt->u.dst);
+		rt = (struct rtable *)skb->dst;
+		skb->dst = odst;
+
+		fl.nl_u.ip4_u.daddr = iph->saddr;
+		fl.nl_u.ip4_u.saddr = iph->daddr;
+		fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
+	}
+
+	if (rt->u.dst.error) {
+		dst_release(&rt->u.dst);
+		return NULL;
+	}
+
+	fl.proto = IPPROTO_TCP;
+	fl.fl_ip_sport = tcph->dest;
+	fl.fl_ip_dport = tcph->source;
+
+	if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) {
+		dst_release(&rt->u.dst);
+		rt = NULL;
+	}
+
+	return rt;
+}
+
+/* Send RST reply */
+static void send_reset(struct sk_buff *oldskb, int hook)
+{
+	struct sk_buff *nskb;
+	struct tcphdr _otcph, *oth, *tcph;
+	struct rtable *rt;
+	u_int16_t tmp_port;
+	u_int32_t tmp_addr;
+	int needs_ack;
+	int hh_len;
+
+	/* IP header checks: fragment. */
+	if (oldskb->nh.iph->frag_off & htons(IP_OFFSET))
+		return;
+
+	oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4,
+				 sizeof(_otcph), &_otcph);
+	if (oth == NULL)
+ 		return;
+
+	/* No RST for RST. */
+	if (oth->rst)
+		return;
+
+	/* FIXME: Check checksum --RR */
+	if ((rt = route_reverse(oldskb, oth, hook)) == NULL)
+		return;
+
+	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+	/* We need a linear, writeable skb.  We also need to expand
+	   headroom in case hh_len of incoming interface < hh_len of
+	   outgoing interface */
+	nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb),
+			       GFP_ATOMIC);
+	if (!nskb) {
+		dst_release(&rt->u.dst);
+		return;
+	}
+
+	dst_release(nskb->dst);
+	nskb->dst = &rt->u.dst;
+
+	/* This packet will not be the same as the other: clear nf fields */
+	nf_reset(nskb);
+	nskb->nfcache = 0;
+	nskb->nfmark = 0;
+#ifdef CONFIG_BRIDGE_NETFILTER
+	nf_bridge_put(nskb->nf_bridge);
+	nskb->nf_bridge = NULL;
+#endif
+
+	tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl);
+
+	/* Swap source and dest */
+	tmp_addr = nskb->nh.iph->saddr;
+	nskb->nh.iph->saddr = nskb->nh.iph->daddr;
+	nskb->nh.iph->daddr = tmp_addr;
+	tmp_port = tcph->source;
+	tcph->source = tcph->dest;
+	tcph->dest = tmp_port;
+
+	/* Truncate to length (no data) */
+	tcph->doff = sizeof(struct tcphdr)/4;
+	skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr));
+	nskb->nh.iph->tot_len = htons(nskb->len);
+
+	if (tcph->ack) {
+		needs_ack = 0;
+		tcph->seq = oth->ack_seq;
+		tcph->ack_seq = 0;
+	} else {
+		needs_ack = 1;
+		tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin
+				      + oldskb->len - oldskb->nh.iph->ihl*4
+				      - (oth->doff<<2));
+		tcph->seq = 0;
+	}
+
+	/* Reset flags */
+	((u_int8_t *)tcph)[13] = 0;
+	tcph->rst = 1;
+	tcph->ack = needs_ack;
+
+	tcph->window = 0;
+	tcph->urg_ptr = 0;
+
+	/* Adjust TCP checksum */
+	tcph->check = 0;
+	tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr),
+				   nskb->nh.iph->saddr,
+				   nskb->nh.iph->daddr,
+				   csum_partial((char *)tcph,
+						sizeof(struct tcphdr), 0));
+
+	/* Adjust IP TTL, DF */
+	nskb->nh.iph->ttl = MAXTTL;
+	/* Set DF, id = 0 */
+	nskb->nh.iph->frag_off = htons(IP_DF);
+	nskb->nh.iph->id = 0;
+
+	/* Adjust IP checksum */
+	nskb->nh.iph->check = 0;
+	nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph, 
+					   nskb->nh.iph->ihl);
+
+	/* "Never happens" */
+	if (nskb->len > dst_mtu(nskb->dst))
+		goto free_nskb;
+
+	nf_ct_attach(nskb, oldskb);
+
+	NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev,
+		dst_output);
+	return;
+
+ free_nskb:
+	kfree_skb(nskb);
+}
+
+static inline void send_unreach(struct sk_buff *skb_in, int code)
+{
+	icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0);
+}	
+
+static unsigned int reject(struct sk_buff **pskb,
+			   const struct net_device *in,
+			   const struct net_device *out,
+			   unsigned int hooknum,
+			   const void *targinfo,
+			   void *userinfo)
+{
+	const struct ipt_reject_info *reject = targinfo;
+
+	/* Our naive response construction doesn't deal with IP
+           options, and probably shouldn't try. */
+	if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr))
+		return NF_DROP;
+
+	/* WARNING: This code causes reentry within iptables.
+	   This means that the iptables jump stack is now crap.  We
+	   must return an absolute verdict. --RR */
+    	switch (reject->with) {
+    	case IPT_ICMP_NET_UNREACHABLE:
+    		send_unreach(*pskb, ICMP_NET_UNREACH);
+    		break;
+    	case IPT_ICMP_HOST_UNREACHABLE:
+    		send_unreach(*pskb, ICMP_HOST_UNREACH);
+    		break;
+    	case IPT_ICMP_PROT_UNREACHABLE:
+    		send_unreach(*pskb, ICMP_PROT_UNREACH);
+    		break;
+    	case IPT_ICMP_PORT_UNREACHABLE:
+    		send_unreach(*pskb, ICMP_PORT_UNREACH);
+    		break;
+    	case IPT_ICMP_NET_PROHIBITED:
+    		send_unreach(*pskb, ICMP_NET_ANO);
+    		break;
+	case IPT_ICMP_HOST_PROHIBITED:
+    		send_unreach(*pskb, ICMP_HOST_ANO);
+    		break;
+    	case IPT_ICMP_ADMIN_PROHIBITED:
+		send_unreach(*pskb, ICMP_PKT_FILTERED);
+		break;
+	case IPT_TCP_RESET:
+		send_reset(*pskb, hooknum);
+	case IPT_ICMP_ECHOREPLY:
+		/* Doesn't happen. */
+		break;
+	}
+
+	return NF_DROP;
+}
+
+static int check(const char *tablename,
+		 const struct ipt_entry *e,
+		 void *targinfo,
+		 unsigned int targinfosize,
+		 unsigned int hook_mask)
+{
+ 	const struct ipt_reject_info *rejinfo = targinfo;
+
+ 	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) {
+  		DEBUGP("REJECT: targinfosize %u != 0\n", targinfosize);
+  		return 0;
+  	}
+
+	/* Only allow these for packet filtering. */
+	if (strcmp(tablename, "filter") != 0) {
+		DEBUGP("REJECT: bad table `%s'.\n", tablename);
+		return 0;
+	}
+	if ((hook_mask & ~((1 << NF_IP_LOCAL_IN)
+			   | (1 << NF_IP_FORWARD)
+			   | (1 << NF_IP_LOCAL_OUT))) != 0) {
+		DEBUGP("REJECT: bad hook mask %X\n", hook_mask);
+		return 0;
+	}
+
+	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
+		printk("REJECT: ECHOREPLY no longer supported.\n");
+		return 0;
+	} else if (rejinfo->with == IPT_TCP_RESET) {
+		/* Must specify that it's a TCP packet */
+		if (e->ip.proto != IPPROTO_TCP
+		    || (e->ip.invflags & IPT_INV_PROTO)) {
+			DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n");
+			return 0;
+		}
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_reject_reg = {
+	.name		= "REJECT",
+	.target		= reject,
+	.checkentry	= check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_reject_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_reject_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c
new file mode 100644
index 000000000000..7a0536d864ac
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_SAME.c
@@ -0,0 +1,211 @@
+/* Same.  Just like SNAT, only try to make the connections
+ * 	  between client A and server B always have the same source ip.
+ *
+ * (C) 2000 Paul `Rusty' Russell
+ * (C) 2001 Martin Josefsson
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * 010320 Martin Josefsson <gandalf@wlug.westbo.se>
+ * 	* copied ipt_BALANCE.c to ipt_SAME.c and changed a few things.
+ * 010728 Martin Josefsson <gandalf@wlug.westbo.se>
+ * 	* added --nodst to not include destination-ip in new source
+ * 	  calculations.
+ *	* added some more sanity-checks.
+ * 010729 Martin Josefsson <gandalf@wlug.westbo.se>
+ * 	* fixed a buggy if-statement in same_check(), should have
+ * 	  used ntohl() but didn't.
+ * 	* added support for multiple ranges. IPT_SAME_MAX_RANGE is
+ * 	  defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h
+ * 	  and is currently set to 10.
+ * 	* added support for 1-address range, nice to have now that
+ * 	  we have multiple ranges.
+ */
+#include <linux/types.h>
+#include <linux/ip.h>
+#include <linux/timer.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netdevice.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <net/protocol.h>
+#include <net/checksum.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ipt_SAME.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Josefsson <gandalf@wlug.westbo.se>");
+MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int
+same_check(const char *tablename,
+	      const struct ipt_entry *e,
+	      void *targinfo,
+	      unsigned int targinfosize,
+	      unsigned int hook_mask)
+{
+	unsigned int count, countess, rangeip, index = 0;
+	struct ipt_same_info *mr = targinfo;
+
+	mr->ipnum = 0;
+
+	if (strcmp(tablename, "nat") != 0) {
+		DEBUGP("same_check: bad table `%s'.\n", tablename);
+		return 0;
+	}
+	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {
+		DEBUGP("same_check: size %u.\n", targinfosize);
+		return 0;
+	}
+	if (hook_mask & ~(1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING)) {
+		DEBUGP("same_check: bad hooks %x.\n", hook_mask);
+		return 0;
+	}
+	if (mr->rangesize < 1) {
+		DEBUGP("same_check: need at least one dest range.\n");
+		return 0;
+	}
+	if (mr->rangesize > IPT_SAME_MAX_RANGE) {
+		DEBUGP("same_check: too many ranges specified, maximum "
+				"is %u ranges\n",
+				IPT_SAME_MAX_RANGE);
+		return 0;
+	}
+	for (count = 0; count < mr->rangesize; count++) {
+		if (ntohl(mr->range[count].min_ip) >
+				ntohl(mr->range[count].max_ip)) {
+			DEBUGP("same_check: min_ip is larger than max_ip in "
+				"range `%u.%u.%u.%u-%u.%u.%u.%u'.\n",
+				NIPQUAD(mr->range[count].min_ip),
+				NIPQUAD(mr->range[count].max_ip));
+			return 0;
+		}
+		if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) {
+			DEBUGP("same_check: bad MAP_IPS.\n");
+			return 0;
+		}
+		rangeip = (ntohl(mr->range[count].max_ip) - 
+					ntohl(mr->range[count].min_ip) + 1);
+		mr->ipnum += rangeip;
+		
+		DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip);
+	}
+	DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum);
+	
+	mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL);
+	if (!mr->iparray) {
+		DEBUGP("same_check: Couldn't allocate %u bytes "
+			"for %u ipaddresses!\n", 
+			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+		return 0;
+	}
+	DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n",
+			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+	
+	for (count = 0; count < mr->rangesize; count++) {
+		for (countess = ntohl(mr->range[count].min_ip);
+				countess <= ntohl(mr->range[count].max_ip);
+					countess++) {
+			mr->iparray[index] = countess;
+			DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' "
+				"in index %u.\n",
+				HIPQUAD(countess), index);
+			index++;
+		}
+	}
+	return 1;
+}
+
+static void 
+same_destroy(void *targinfo,
+		unsigned int targinfosize)
+{
+	struct ipt_same_info *mr = targinfo;
+
+	kfree(mr->iparray);
+	
+	DEBUGP("same_destroy: Deallocated %u bytes for %u ipaddresses.\n",
+			(sizeof(u_int32_t) * mr->ipnum), mr->ipnum);
+}
+
+static unsigned int
+same_target(struct sk_buff **pskb,
+		const struct net_device *in,
+		const struct net_device *out,
+		unsigned int hooknum,
+		const void *targinfo,
+		void *userinfo)
+{
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t tmpip, aindex, new_ip;
+	const struct ipt_same_info *same = targinfo;
+	struct ip_nat_range newrange;
+	const struct ip_conntrack_tuple *t;
+
+	IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING ||
+			hooknum == NF_IP_POST_ROUTING);
+	ct = ip_conntrack_get(*pskb, &ctinfo);
+
+	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+
+	/* Base new source on real src ip and optionally dst ip,
+	   giving some hope for consistency across reboots.
+	   Here we calculate the index in same->iparray which
+	   holds the ipaddress we should use */
+	
+	tmpip = ntohl(t->src.ip);
+
+	if (!(same->info & IPT_SAME_NODST))
+		tmpip += ntohl(t->dst.ip);
+	
+	aindex = tmpip % same->ipnum;
+
+	new_ip = htonl(same->iparray[aindex]);
+
+	DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, "
+			"new src=%u.%u.%u.%u\n",
+			NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip),
+			NIPQUAD(new_ip));
+
+	/* Transfer from original range. */
+	newrange = ((struct ip_nat_range)
+		{ same->range[0].flags, new_ip, new_ip,
+		  /* FIXME: Use ports from correct range! */
+		  same->range[0].min, same->range[0].max });
+
+	/* Hand modified range to generic setup. */
+	return ip_nat_setup_info(ct, &newrange, hooknum);
+}
+
+static struct ipt_target same_reg = { 
+	.name		= "SAME",
+	.target		= same_target,
+	.checkentry	= same_check,
+	.destroy	= same_destroy,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&same_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&same_reg);
+}
+
+module_init(init);
+module_exit(fini);
+
diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c
new file mode 100644
index 000000000000..1049050b2bfb
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TCPMSS.c
@@ -0,0 +1,262 @@
+/*
+ * This is a module which is used for setting the MSS option in TCP packets.
+ *
+ * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TCPMSS.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables TCP MSS modification module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static u_int16_t
+cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
+{
+	u_int32_t diffs[] = { oldvalinv, newval };
+	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
+                                      oldcheck^0xFFFF));
+}
+
+static inline unsigned int
+optlen(const u_int8_t *opt, unsigned int offset)
+{
+	/* Beware zero-length options: make finite progress */
+	if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1;
+	else return opt[offset+1];
+}
+
+static unsigned int
+ipt_tcpmss_target(struct sk_buff **pskb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  unsigned int hooknum,
+		  const void *targinfo,
+		  void *userinfo)
+{
+	const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
+	struct tcphdr *tcph;
+	struct iphdr *iph;
+	u_int16_t tcplen, newtotlen, oldval, newmss;
+	unsigned int i;
+	u_int8_t *opt;
+
+	if (!skb_ip_make_writable(pskb, (*pskb)->len))
+		return NF_DROP;
+
+	iph = (*pskb)->nh.iph;
+	tcplen = (*pskb)->len - iph->ihl*4;
+
+	tcph = (void *)iph + iph->ihl*4;
+
+	/* Since it passed flags test in tcp match, we know it is is
+	   not a fragment, and has data >= tcp header length.  SYN
+	   packets should not contain data: if they did, then we risk
+	   running over MTU, sending Frag Needed and breaking things
+	   badly. --RR */
+	if (tcplen != tcph->doff*4) {
+		if (net_ratelimit())
+			printk(KERN_ERR
+			       "ipt_tcpmss_target: bad length (%d bytes)\n",
+			       (*pskb)->len);
+		return NF_DROP;
+	}
+
+	if(tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) {
+		if(!(*pskb)->dst) {
+			if (net_ratelimit())
+				printk(KERN_ERR
+			       		"ipt_tcpmss_target: no dst?! can't determine path-MTU\n");
+			return NF_DROP; /* or IPT_CONTINUE ?? */
+		}
+
+		if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) {
+			if (net_ratelimit())
+				printk(KERN_ERR
+		       			"ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_mtu((*pskb)->dst));
+			return NF_DROP; /* or IPT_CONTINUE ?? */
+		}
+
+		newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr);
+	} else
+		newmss = tcpmssinfo->mss;
+
+ 	opt = (u_int8_t *)tcph;
+	for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)){
+		if ((opt[i] == TCPOPT_MSS) &&
+		    ((tcph->doff*4 - i) >= TCPOLEN_MSS) &&
+		    (opt[i+1] == TCPOLEN_MSS)) {
+			u_int16_t oldmss;
+
+			oldmss = (opt[i+2] << 8) | opt[i+3];
+
+			if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) &&
+				(oldmss <= newmss))
+					return IPT_CONTINUE;
+
+			opt[i+2] = (newmss & 0xff00) >> 8;
+			opt[i+3] = (newmss & 0x00ff);
+
+			tcph->check = cheat_check(htons(oldmss)^0xFFFF,
+						  htons(newmss),
+						  tcph->check);
+
+			DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
+			       "->%u.%u.%u.%u:%hu changed TCP MSS option"
+			       " (from %u to %u)\n", 
+			       NIPQUAD((*pskb)->nh.iph->saddr),
+			       ntohs(tcph->source),
+			       NIPQUAD((*pskb)->nh.iph->daddr),
+			       ntohs(tcph->dest),
+			       oldmss, newmss);
+			goto retmodified;
+		}
+	}
+
+	/*
+	 * MSS Option not found ?! add it..
+	 */
+	if (skb_tailroom((*pskb)) < TCPOLEN_MSS) {
+		struct sk_buff *newskb;
+
+		newskb = skb_copy_expand(*pskb, skb_headroom(*pskb),
+					 TCPOLEN_MSS, GFP_ATOMIC);
+		if (!newskb) {
+			if (net_ratelimit())
+				printk(KERN_ERR "ipt_tcpmss_target:"
+				       " unable to allocate larger skb\n");
+			return NF_DROP;
+		}
+
+		kfree_skb(*pskb);
+		*pskb = newskb;
+		iph = (*pskb)->nh.iph;
+		tcph = (void *)iph + iph->ihl*4;
+	}
+
+	skb_put((*pskb), TCPOLEN_MSS);
+
+ 	opt = (u_int8_t *)tcph + sizeof(struct tcphdr);
+	memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr));
+
+	tcph->check = cheat_check(htons(tcplen) ^ 0xFFFF,
+				  htons(tcplen + TCPOLEN_MSS), tcph->check);
+	tcplen += TCPOLEN_MSS;
+
+	opt[0] = TCPOPT_MSS;
+	opt[1] = TCPOLEN_MSS;
+	opt[2] = (newmss & 0xff00) >> 8;
+	opt[3] = (newmss & 0x00ff);
+
+	tcph->check = cheat_check(~0, *((u_int32_t *)opt), tcph->check);
+
+	oldval = ((u_int16_t *)tcph)[6];
+	tcph->doff += TCPOLEN_MSS/4;
+	tcph->check = cheat_check(oldval ^ 0xFFFF,
+				  ((u_int16_t *)tcph)[6], tcph->check);
+
+	newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS);
+	iph->check = cheat_check(iph->tot_len ^ 0xFFFF,
+				 newtotlen, iph->check);
+	iph->tot_len = newtotlen;
+
+	DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu"
+	       "->%u.%u.%u.%u:%hu added TCP MSS option (%u)\n",
+	       NIPQUAD((*pskb)->nh.iph->saddr),
+	       ntohs(tcph->source),
+	       NIPQUAD((*pskb)->nh.iph->daddr),
+	       ntohs(tcph->dest),
+	       newmss);
+
+ retmodified:
+	/* We never hw checksum SYN packets.  */
+	BUG_ON((*pskb)->ip_summed == CHECKSUM_HW);
+
+	(*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED;
+	return IPT_CONTINUE;
+}
+
+#define TH_SYN 0x02
+
+static inline int find_syn_match(const struct ipt_entry_match *m)
+{
+	const struct ipt_tcp *tcpinfo = (const struct ipt_tcp *)m->data;
+
+	if (strcmp(m->u.kernel.match->name, "tcp") == 0
+	    && (tcpinfo->flg_cmp & TH_SYN)
+	    && !(tcpinfo->invflags & IPT_TCP_INV_FLAGS))
+		return 1;
+
+	return 0;
+}
+
+/* Must specify -p tcp --syn/--tcp-flags SYN */
+static int
+ipt_tcpmss_checkentry(const char *tablename,
+		      const struct ipt_entry *e,
+		      void *targinfo,
+		      unsigned int targinfosize,
+		      unsigned int hook_mask)
+{
+	const struct ipt_tcpmss_info *tcpmssinfo = targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) {
+		DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %u\n",
+		       targinfosize, IPT_ALIGN(sizeof(struct ipt_tcpmss_info)));
+		return 0;
+	}
+
+
+	if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) && 
+			((hook_mask & ~((1 << NF_IP_FORWARD)
+			   	| (1 << NF_IP_LOCAL_OUT)
+			   	| (1 << NF_IP_POST_ROUTING))) != 0)) {
+		printk("TCPMSS: path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n");
+		return 0;
+	}
+
+	if (e->ip.proto == IPPROTO_TCP
+	    && !(e->ip.invflags & IPT_INV_PROTO)
+	    && IPT_MATCH_ITERATE(e, find_syn_match))
+		return 1;
+
+	printk("TCPMSS: Only works on TCP SYN packets\n");
+	return 0;
+}
+
+static struct ipt_target ipt_tcpmss_reg = {
+	.name		= "TCPMSS",
+	.target		= ipt_tcpmss_target,
+	.checkentry	= ipt_tcpmss_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_tcpmss_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_tcpmss_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c
new file mode 100644
index 000000000000..85c70d240f8b
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_TOS.c
@@ -0,0 +1,105 @@
+/* This is a module which is used for setting the TOS field of a packet. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <net/checksum.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_TOS.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables TOS mangling module");
+
+static unsigned int
+target(struct sk_buff **pskb,
+       const struct net_device *in,
+       const struct net_device *out,
+       unsigned int hooknum,
+       const void *targinfo,
+       void *userinfo)
+{
+	const struct ipt_tos_target_info *tosinfo = targinfo;
+
+	if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) {
+		u_int16_t diffs[2];
+
+		if (!skb_ip_make_writable(pskb, sizeof(struct iphdr)))
+			return NF_DROP;
+
+		diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF;
+		(*pskb)->nh.iph->tos
+			= ((*pskb)->nh.iph->tos & IPTOS_PREC_MASK)
+			| tosinfo->tos;
+		diffs[1] = htons((*pskb)->nh.iph->tos);
+		(*pskb)->nh.iph->check
+			= csum_fold(csum_partial((char *)diffs,
+						 sizeof(diffs),
+						 (*pskb)->nh.iph->check
+						 ^0xFFFF));
+		(*pskb)->nfcache |= NFC_ALTERED;
+	}
+	return IPT_CONTINUE;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_entry *e,
+           void *targinfo,
+           unsigned int targinfosize,
+           unsigned int hook_mask)
+{
+	const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) {
+		printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n",
+		       targinfosize,
+		       IPT_ALIGN(sizeof(struct ipt_tos_target_info)));
+		return 0;
+	}
+
+	if (strcmp(tablename, "mangle") != 0) {
+		printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename);
+		return 0;
+	}
+
+	if (tos != IPTOS_LOWDELAY
+	    && tos != IPTOS_THROUGHPUT
+	    && tos != IPTOS_RELIABILITY
+	    && tos != IPTOS_MINCOST
+	    && tos != IPTOS_NORMALSVC) {
+		printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_tos_reg = {
+	.name		= "TOS",
+	.target		= target,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_target(&ipt_tos_reg);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_target(&ipt_tos_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
new file mode 100644
index 000000000000..6f2cefbe16cd
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -0,0 +1,419 @@
+/*
+ * netfilter module for userspace packet logging daemons
+ *
+ * (C) 2000-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * 2000/09/22 ulog-cprange feature added
+ * 2001/01/04 in-kernel queue as proposed by Sebastian Zander 
+ * 						<zander@fokus.gmd.de>
+ * 2001/01/30 per-rule nlgroup conflicts with global queue. 
+ *            nlgroup now global (sysctl)
+ * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at
+ * 	      module loadtime -HW
+ * 2002/07/07 remove broken nflog_rcv() function -HW
+ * 2002/08/29 fix shifted/unshifted nlgroup bug -HW
+ * 2002/10/30 fix uninitialized mac_len field - <Anders K. Pedersen>
+ * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT
+ *	      resulting in bogus 'error during NLMSG_PUT' messages.
+ *
+ * (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This module accepts two parameters: 
+ * 
+ * nlbufsiz:
+ *   The parameter specifies how big the buffer for each netlink multicast
+ * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will
+ * get accumulated in the kernel until they are sent to userspace. It is
+ * NOT possible to allocate more than 128kB, and it is strongly discouraged,
+ * because atomically allocating 128kB inside the network rx softirq is not
+ * reliable. Please also keep in mind that this buffer size is allocated for
+ * each nlgroup you are using, so the total kernel memory usage increases
+ * by that factor.
+ *
+ * flushtimeout:
+ *   Specify, after how many hundredths of a second the queue should be
+ *   flushed even if it is not full yet.
+ *
+ * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp
+ */
+
+#include <linux/module.h>
+#include <linux/config.h>
+#include <linux/spinlock.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/netlink.h>
+#include <linux/netdevice.h>
+#include <linux/mm.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ULOG.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+#include <net/sock.h>
+#include <linux/bitops.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@gnumonks.org>");
+MODULE_DESCRIPTION("iptables userspace logging module");
+
+#define ULOG_NL_EVENT		111		/* Harald's favorite number */
+#define ULOG_MAXNLGROUPS	32		/* numer of nlgroups */
+
+#if 0
+#define DEBUGP(format, args...) printk("%s:%s:" format, \
+                                       __FILE__, __FUNCTION__ , ## args)
+#else
+#define DEBUGP(format, args...)
+#endif
+
+#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0)
+
+static unsigned int nlbufsiz = 4096;
+module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */
+MODULE_PARM_DESC(nlbufsiz, "netlink buffer size");
+
+static unsigned int flushtimeout = 10;
+module_param(flushtimeout, int, 0600);
+MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)");
+
+static unsigned int nflog = 1;
+module_param(nflog, int, 0400);
+MODULE_PARM_DESC(nflog, "register as internal netfilter logging module");
+
+/* global data structures */
+
+typedef struct {
+	unsigned int qlen;		/* number of nlmsgs' in the skb */
+	struct nlmsghdr *lastnlh;	/* netlink header of last msg in skb */
+	struct sk_buff *skb;		/* the pre-allocated skb */
+	struct timer_list timer;	/* the timer function */
+} ulog_buff_t;
+
+static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS];	/* array of buffers */
+
+static struct sock *nflognl;	/* our socket */
+static DECLARE_LOCK(ulog_lock);	/* spinlock */
+
+/* send one ulog_buff_t to userspace */
+static void ulog_send(unsigned int nlgroupnum)
+{
+	ulog_buff_t *ub = &ulog_buffers[nlgroupnum];
+
+	if (timer_pending(&ub->timer)) {
+		DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n");
+		del_timer(&ub->timer);
+	}
+
+	/* last nlmsg needs NLMSG_DONE */
+	if (ub->qlen > 1)
+		ub->lastnlh->nlmsg_type = NLMSG_DONE;
+
+	NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum);
+	DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n",
+		ub->qlen, nlgroupnum);
+	netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC);
+
+	ub->qlen = 0;
+	ub->skb = NULL;
+	ub->lastnlh = NULL;
+
+}
+
+
+/* timer function to flush queue in flushtimeout time */
+static void ulog_timer(unsigned long data)
+{
+	DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n");
+
+	/* lock to protect against somebody modifying our structure
+	 * from ipt_ulog_target at the same time */
+	LOCK_BH(&ulog_lock);
+	ulog_send(data);
+	UNLOCK_BH(&ulog_lock);
+}
+
+static struct sk_buff *ulog_alloc_skb(unsigned int size)
+{
+	struct sk_buff *skb;
+
+	/* alloc skb which should be big enough for a whole
+	 * multipart message. WARNING: has to be <= 131000
+	 * due to slab allocator restrictions */
+
+	skb = alloc_skb(nlbufsiz, GFP_ATOMIC);
+	if (!skb) {
+		PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n",
+			nlbufsiz);
+
+		/* try to allocate only as much as we need for 
+		 * current packet */
+
+		skb = alloc_skb(size, GFP_ATOMIC);
+		if (!skb)
+			PRINTR("ipt_ULOG: can't even allocate %ub\n", size);
+	}
+
+	return skb;
+}
+
+static void ipt_ulog_packet(unsigned int hooknum,
+			    const struct sk_buff *skb,
+			    const struct net_device *in,
+			    const struct net_device *out,
+			    const struct ipt_ulog_info *loginfo,
+			    const char *prefix)
+{
+	ulog_buff_t *ub;
+	ulog_packet_msg_t *pm;
+	size_t size, copy_len;
+	struct nlmsghdr *nlh;
+
+	/* ffs == find first bit set, necessary because userspace
+	 * is already shifting groupnumber, but we need unshifted.
+	 * ffs() returns [1..32], we need [0..31] */
+	unsigned int groupnum = ffs(loginfo->nl_group) - 1;
+
+	/* calculate the size of the skb needed */
+	if ((loginfo->copy_range == 0) ||
+	    (loginfo->copy_range > skb->len)) {
+		copy_len = skb->len;
+	} else {
+		copy_len = loginfo->copy_range;
+	}
+
+	size = NLMSG_SPACE(sizeof(*pm) + copy_len);
+
+	ub = &ulog_buffers[groupnum];
+	
+	LOCK_BH(&ulog_lock);
+
+	if (!ub->skb) {
+		if (!(ub->skb = ulog_alloc_skb(size)))
+			goto alloc_failure;
+	} else if (ub->qlen >= loginfo->qthreshold ||
+		   size > skb_tailroom(ub->skb)) {
+		/* either the queue len is too high or we don't have 
+		 * enough room in nlskb left. send it to userspace. */
+
+		ulog_send(groupnum);
+
+		if (!(ub->skb = ulog_alloc_skb(size)))
+			goto alloc_failure;
+	}
+
+	DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen, 
+		loginfo->qthreshold);
+
+	/* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */
+	nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, 
+			sizeof(*pm)+copy_len);
+	ub->qlen++;
+
+	pm = NLMSG_DATA(nlh);
+
+	/* We might not have a timestamp, get one */
+	if (skb->stamp.tv_sec == 0)
+		do_gettimeofday((struct timeval *)&skb->stamp);
+
+	/* copy hook, prefix, timestamp, payload, etc. */
+	pm->data_len = copy_len;
+	pm->timestamp_sec = skb->stamp.tv_sec;
+	pm->timestamp_usec = skb->stamp.tv_usec;
+	pm->mark = skb->nfmark;
+	pm->hook = hooknum;
+	if (prefix != NULL)
+		strncpy(pm->prefix, prefix, sizeof(pm->prefix));
+	else if (loginfo->prefix[0] != '\0')
+		strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix));
+	else
+		*(pm->prefix) = '\0';
+
+	if (in && in->hard_header_len > 0
+	    && skb->mac.raw != (void *) skb->nh.iph
+	    && in->hard_header_len <= ULOG_MAC_LEN) {
+		memcpy(pm->mac, skb->mac.raw, in->hard_header_len);
+		pm->mac_len = in->hard_header_len;
+	} else
+		pm->mac_len = 0;
+
+	if (in)
+		strncpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+	else
+		pm->indev_name[0] = '\0';
+
+	if (out)
+		strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+	else
+		pm->outdev_name[0] = '\0';
+
+	/* copy_len <= skb->len, so can't fail. */
+	if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0)
+		BUG();
+	
+	/* check if we are building multi-part messages */
+	if (ub->qlen > 1) {
+		ub->lastnlh->nlmsg_flags |= NLM_F_MULTI;
+	}
+
+	ub->lastnlh = nlh;
+
+	/* if timer isn't already running, start it */
+	if (!timer_pending(&ub->timer)) {
+		ub->timer.expires = jiffies + flushtimeout * HZ / 100;
+		add_timer(&ub->timer);
+	}
+
+	/* if threshold is reached, send message to userspace */
+	if (ub->qlen >= loginfo->qthreshold) {
+		if (loginfo->qthreshold > 1)
+			nlh->nlmsg_type = NLMSG_DONE;
+		ulog_send(groupnum);
+	}
+
+	UNLOCK_BH(&ulog_lock);
+
+	return;
+
+nlmsg_failure:
+	PRINTR("ipt_ULOG: error during NLMSG_PUT\n");
+
+alloc_failure:
+	PRINTR("ipt_ULOG: Error building netlink message\n");
+
+	UNLOCK_BH(&ulog_lock);
+}
+
+static unsigned int ipt_ulog_target(struct sk_buff **pskb,
+				    const struct net_device *in,
+				    const struct net_device *out,
+				    unsigned int hooknum,
+				    const void *targinfo, void *userinfo)
+{
+	struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
+
+	ipt_ulog_packet(hooknum, *pskb, in, out, loginfo, NULL);
+ 
+ 	return IPT_CONTINUE;
+}
+ 
+static void ipt_logfn(unsigned int hooknum,
+		      const struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const char *prefix)
+{
+	struct ipt_ulog_info loginfo = { 
+		.nl_group = ULOG_DEFAULT_NLGROUP,
+		.copy_range = 0,
+		.qthreshold = ULOG_DEFAULT_QTHRESHOLD,
+		.prefix = ""
+	};
+
+	ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix);
+}
+
+static int ipt_ulog_checkentry(const char *tablename,
+			       const struct ipt_entry *e,
+			       void *targinfo,
+			       unsigned int targinfosize,
+			       unsigned int hookmask)
+{
+	struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo;
+
+	if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ulog_info))) {
+		DEBUGP("ipt_ULOG: targinfosize %u != 0\n", targinfosize);
+		return 0;
+	}
+
+	if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') {
+		DEBUGP("ipt_ULOG: prefix term %i\n",
+		       loginfo->prefix[sizeof(loginfo->prefix) - 1]);
+		return 0;
+	}
+
+	if (loginfo->qthreshold > ULOG_MAX_QLEN) {
+		DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n",
+			loginfo->qthreshold);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_target ipt_ulog_reg = {
+	.name		= "ULOG",
+	.target		= ipt_ulog_target,
+	.checkentry	= ipt_ulog_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int i;
+
+	DEBUGP("ipt_ULOG: init module\n");
+
+	if (nlbufsiz >= 128*1024) {
+		printk("Netlink buffer has to be <= 128kB\n");
+		return -EINVAL;
+	}
+
+	/* initialize ulog_buffers */
+	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+		init_timer(&ulog_buffers[i].timer);
+		ulog_buffers[i].timer.function = ulog_timer;
+		ulog_buffers[i].timer.data = i;
+	}
+
+	nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL);
+	if (!nflognl)
+		return -ENOMEM;
+
+	if (ipt_register_target(&ipt_ulog_reg) != 0) {
+		sock_release(nflognl->sk_socket);
+		return -EINVAL;
+	}
+	if (nflog)
+		nf_log_register(PF_INET, &ipt_logfn);
+	
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	ulog_buff_t *ub;
+	int i;
+
+	DEBUGP("ipt_ULOG: cleanup_module\n");
+
+	if (nflog)
+		nf_log_unregister(PF_INET, &ipt_logfn);
+	ipt_unregister_target(&ipt_ulog_reg);
+	sock_release(nflognl->sk_socket);
+
+	/* remove pending timers and free allocated skb's */
+	for (i = 0; i < ULOG_MAXNLGROUPS; i++) {
+		ub = &ulog_buffers[i];
+		if (timer_pending(&ub->timer)) {
+			DEBUGP("timer was pending, deleting\n");
+			del_timer(&ub->timer);
+		}
+
+		if (ub->skb) {
+			kfree_skb(ub->skb);
+			ub->skb = NULL;
+		}
+	}
+
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c
new file mode 100644
index 000000000000..f5909a4c3fc7
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_addrtype.c
@@ -0,0 +1,77 @@
+/*
+ *  iptables module to match inet_addr_type() of an ip.
+ *
+ *  Copyright (c) 2004 Patrick McHardy <kaber@trash.net>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/ip.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4/ipt_addrtype.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("iptables addrtype match");
+
+static inline int match_type(u_int32_t addr, u_int16_t mask)
+{
+	return !!(mask & (1 << inet_addr_type(addr)));
+}
+
+static int match(const struct sk_buff *skb, const struct net_device *in,
+		 const struct net_device *out, const void *matchinfo,
+		 int offset, int *hotdrop)
+{
+	const struct ipt_addrtype_info *info = matchinfo;
+	const struct iphdr *iph = skb->nh.iph;
+	int ret = 1;
+
+	if (info->source)
+		ret &= match_type(iph->saddr, info->source)^info->invert_source;
+	if (info->dest)
+		ret &= match_type(iph->daddr, info->dest)^info->invert_dest;
+	
+	return ret;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+		      void *matchinfo, unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) {
+		printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n.",
+		       matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info)));
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match addrtype_match = {
+	.name		= "addrtype",
+	.match		= match,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&addrtype_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&addrtype_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c
new file mode 100644
index 000000000000..a0fea847cb72
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ah.c
@@ -0,0 +1,117 @@
+/* Kernel module to match AH parameters. */
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_ah.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("iptables AH SPI match module");
+
+#ifdef DEBUG_CONNTRACK
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline int
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+{
+	int r=0;
+        duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
+        	min,spi,max);
+	r=(spi >= min && spi <= max) ^ invert;
+	duprintf(" result %s\n",r? "PASS" : "FAILED");
+	return r;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	struct ip_auth_hdr _ahdr, *ah;
+	const struct ipt_ah *ahinfo = matchinfo;
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_ahdr), &_ahdr);
+	if (ah == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("Dropping evil AH tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return spi_match(ahinfo->spis[0], ahinfo->spis[1],
+			 ntohl(ah->spi),
+			 !!(ahinfo->invflags & IPT_AH_INV_SPI));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchinfosize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_ah *ahinfo = matchinfo;
+
+	/* Must specify proto == AH, and no unknown invflags */
+	if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) {
+		duprintf("ipt_ah: Protocol %u != %u\n", ip->proto,
+			 IPPROTO_AH);
+		return 0;
+	}
+	if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_ah))) {
+		duprintf("ipt_ah: matchsize %u != %u\n",
+			 matchinfosize, IPT_ALIGN(sizeof(struct ipt_ah)));
+		return 0;
+	}
+	if (ahinfo->invflags & ~IPT_AH_INV_MASK) {
+		duprintf("ipt_ah: unknown flags %X\n",
+			 ahinfo->invflags);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match ah_match = {
+	.name		= "ah",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&ah_match);
+}
+
+static void __exit cleanup(void)
+{
+	ipt_unregister_match(&ah_match);
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/ipv4/netfilter/ipt_comment.c
new file mode 100644
index 000000000000..6b76a1ea5245
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_comment.c
@@ -0,0 +1,59 @@
+/*
+ * Implements a dummy match to allow attaching comments to rules
+ *
+ * 2003-05-13 Brad Fisher (brad@info-link.net)
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_comment.h>
+
+MODULE_AUTHOR("Brad Fisher <brad@info-link.net>");
+MODULE_DESCRIPTION("iptables comment match module");
+MODULE_LICENSE("GPL");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	/* We always match */
+	return 1;
+}
+
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	/* Check the size */
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info)))
+		return 0;
+	return 1;
+}
+
+static struct ipt_match comment_match = {
+	.name		= "comment",
+	.match		= match,
+	.checkentry	= checkentry,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&comment_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&comment_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c
new file mode 100644
index 000000000000..2706f96cea55
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_connmark.c
@@ -0,0 +1,81 @@
+/* This kernel module matches connection mark values set by the
+ * CONNMARK target
+ *
+ * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com>
+ * by Henrik Nordstrom <hno@marasystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+MODULE_AUTHOR("Henrik Nordstrom <hno@marasytems.com>");
+MODULE_DESCRIPTION("IP tables connmark match module");
+MODULE_LICENSE("GPL");
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_connmark.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_connmark_info *info = matchinfo;
+	enum ip_conntrack_info ctinfo;
+	struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+	if (!ct)
+		return 0;
+
+	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match connmark_match = {
+	.name = "connmark",
+	.match = &match,
+	.checkentry = &checkentry,
+	.me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&connmark_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&connmark_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c
new file mode 100644
index 000000000000..c1d22801b7cf
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_conntrack.c
@@ -0,0 +1,136 @@
+/* Kernel module to match connection tracking information.
+ * Superset of Rusty's minimalistic state match.
+ *
+ * (C) 2001  Marc Boucher (marc@mbsi.ca).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_conntrack.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables connection tracking match module");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_conntrack_info *sinfo = matchinfo;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+
+#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg))
+
+	if (ct == &ip_conntrack_untracked)
+		statebit = IPT_CONNTRACK_STATE_UNTRACKED;
+	else if (ct)
+ 		statebit = IPT_CONNTRACK_STATE_BIT(ctinfo);
+ 	else
+ 		statebit = IPT_CONNTRACK_STATE_INVALID;
+ 
+	if(sinfo->flags & IPT_CONNTRACK_STATE) {
+		if (ct) {
+			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip)
+				statebit |= IPT_CONNTRACK_STATE_SNAT;
+
+			if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip !=
+			    ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip)
+				statebit |= IPT_CONNTRACK_STATE_DNAT;
+		}
+
+		if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_PROTO) {
+		if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO))
+                	return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_ORIGDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_REPLSRC) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_REPLDST) {
+		if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_STATUS) {
+		if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS))
+			return 0;
+	}
+
+	if(sinfo->flags & IPT_CONNTRACK_EXPIRES) {
+		unsigned long expires;
+
+		if(!ct)
+			return 0;
+
+		expires = timer_pending(&ct->timeout) ? (ct->timeout.expires - jiffies)/HZ : 0;
+
+		if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match conntrack_match = {
+	.name		= "conntrack",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	need_ip_conntrack();
+	return ipt_register_match(&conntrack_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&conntrack_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c
new file mode 100644
index 000000000000..5df52a64a5d4
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_dscp.c
@@ -0,0 +1,63 @@
+/* IP tables module for matching the value of the IPv4 DSCP field
+ *
+ * ipt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp
+ *
+ * (C) 2002 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_dscp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables DSCP matching module");
+MODULE_LICENSE("GPL");
+
+static int match(const struct sk_buff *skb, const struct net_device *in,
+		 const struct net_device *out, const void *matchinfo,
+		 int offset, int *hotdrop)
+{
+	const struct ipt_dscp_info *info = matchinfo;
+	const struct iphdr *iph = skb->nh.iph;
+
+	u_int8_t sh_dscp = ((info->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK);
+
+	return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+		      void *matchinfo, unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_dscp_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match dscp_match = {
+	.name		= "dscp",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&dscp_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&dscp_match);
+
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
new file mode 100644
index 000000000000..b6f7181e89cc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -0,0 +1,131 @@
+/* IP tables module for matching the value of the IPv4 and TCP ECN bits
+ *
+ * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp
+ *
+ * (C) 2002 by Harald Welte <laforge@gnumonks.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/tcp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_ecn.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables ECN matching module");
+MODULE_LICENSE("GPL");
+
+static inline int match_ip(const struct sk_buff *skb,
+			   const struct ipt_ecn_info *einfo)
+{
+	return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect);
+}
+
+static inline int match_tcp(const struct sk_buff *skb,
+			    const struct ipt_ecn_info *einfo,
+			    int *hotdrop)
+{
+	struct tcphdr _tcph, *th;
+
+	/* In practice, TCP match does this, so can't fail.  But let's
+	 * be good citizens.
+	 */
+	th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL) {
+		*hotdrop = 0;
+		return 0;
+	}
+
+	if (einfo->operation & IPT_ECN_OP_MATCH_ECE) {
+		if (einfo->invert & IPT_ECN_OP_MATCH_ECE) {
+			if (th->ece == 1)
+				return 0;
+		} else {
+			if (th->ece == 0)
+				return 0;
+		}
+	}
+
+	if (einfo->operation & IPT_ECN_OP_MATCH_CWR) {
+		if (einfo->invert & IPT_ECN_OP_MATCH_CWR) {
+			if (th->cwr == 1)
+				return 0;
+		} else {
+			if (th->cwr == 0)
+				return 0;
+		}
+	}
+
+	return 1;
+}
+
+static int match(const struct sk_buff *skb, const struct net_device *in,
+		 const struct net_device *out, const void *matchinfo,
+		 int offset, int *hotdrop)
+{
+	const struct ipt_ecn_info *info = matchinfo;
+
+	if (info->operation & IPT_ECN_OP_MATCH_IP)
+		if (!match_ip(skb, info))
+			return 0;
+
+	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) {
+		if (skb->nh.iph->protocol != IPPROTO_TCP)
+			return 0;
+		if (!match_tcp(skb, info, hotdrop))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+		      void *matchinfo, unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	const struct ipt_ecn_info *info = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info)))
+		return 0;
+
+	if (info->operation & IPT_ECN_OP_MATCH_MASK)
+		return 0;
+
+	if (info->invert & IPT_ECN_OP_MATCH_MASK)
+		return 0;
+
+	if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
+	    && ip->proto != IPPROTO_TCP) {
+		printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
+		       " non-tcp packets\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match ecn_match = {
+	.name		= "ecn",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&ecn_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&ecn_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c
new file mode 100644
index 000000000000..e1d0dd31e117
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_esp.c
@@ -0,0 +1,118 @@
+/* Kernel module to match ESP parameters. */
+
+/* (C) 1999-2000 Yon Uriarte <yon@astaro.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+
+#include <linux/netfilter_ipv4/ipt_esp.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Yon Uriarte <yon@astaro.de>");
+MODULE_DESCRIPTION("iptables ESP SPI match module");
+
+#ifdef DEBUG_CONNTRACK
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the spi is matched by the range, 0 otherwise */
+static inline int
+spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert)
+{
+	int r=0;
+        duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ',
+        	min,spi,max);
+	r=(spi >= min && spi <= max) ^ invert;
+	duprintf(" result %s\n",r? "PASS" : "FAILED");
+	return r;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	struct ip_esp_hdr _esp, *eh;
+	const struct ipt_esp *espinfo = matchinfo;
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_esp), &_esp);
+	if (eh == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("Dropping evil ESP tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return spi_match(espinfo->spis[0], espinfo->spis[1],
+			 ntohl(eh->spi),
+			 !!(espinfo->invflags & IPT_ESP_INV_SPI));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchinfosize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_esp *espinfo = matchinfo;
+
+	/* Must specify proto == ESP, and no unknown invflags */
+	if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) {
+		duprintf("ipt_esp: Protocol %u != %u\n", ip->proto,
+			 IPPROTO_ESP);
+		return 0;
+	}
+	if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_esp))) {
+		duprintf("ipt_esp: matchsize %u != %u\n",
+			 matchinfosize, IPT_ALIGN(sizeof(struct ipt_esp)));
+		return 0;
+	}
+	if (espinfo->invflags & ~IPT_ESP_INV_MASK) {
+		duprintf("ipt_esp: unknown flags %X\n",
+			 espinfo->invflags);
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match esp_match = {
+	.name		= "esp",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&esp_match);
+}
+
+static void __exit cleanup(void)
+{
+	ipt_unregister_match(&esp_match);
+}
+
+module_init(init);
+module_exit(cleanup);
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c
new file mode 100644
index 000000000000..f1937190cd77
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_hashlimit.c
@@ -0,0 +1,731 @@
+/* iptables match extension to limit the number of packets per second
+ * seperately for each hashbucket (sourceip/sourceport/dstip/dstport)
+ *
+ * (C) 2003-2004 by Harald Welte <laforge@netfilter.org>
+ *
+ * $Id: ipt_hashlimit.c 3244 2004-10-20 16:24:29Z laforge@netfilter.org $
+ *
+ * Development of this code was funded by Astaro AG, http://www.astaro.com/
+ *
+ * based on ipt_limit.c by:
+ * J�r�me de Vivie	<devivie@info.enserb.u-bordeaux.fr>
+ * Herv� Eychenne	<eychenne@info.enserb.u-bordeaux.fr>
+ * Rusty Russell	<rusty@rustcorp.com.au>
+ *
+ * The general idea is to create a hash table for every dstip and have a
+ * seperate limit counter per tuple.  This way you can do something like 'limit
+ * the number of syn packets for each of my internal addresses.
+ *
+ * Ideally this would just be implemented as a general 'hash' match, which would
+ * allow us to attach any iptables target to it's hash buckets.  But this is
+ * not possible in the current iptables architecture.  As always, pkttables for
+ * 2.7.x will help ;)
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_hashlimit.h>
+#include <linux/netfilter_ipv4/lockhelp.h>
+
+/* FIXME: this is just for IP_NF_ASSERRT */
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("iptables match for limiting per hash-bucket");
+
+/* need to declare this at the top */
+static struct proc_dir_entry *hashlimit_procdir;
+static struct file_operations dl_file_ops;
+
+/* hash table crap */
+
+struct dsthash_dst {
+	u_int32_t src_ip;
+	u_int32_t dst_ip;
+	/* ports have to be consecutive !!! */
+	u_int16_t src_port;
+	u_int16_t dst_port;
+};
+
+struct dsthash_ent {
+	/* static / read-only parts in the beginning */
+	struct hlist_node node;
+	struct dsthash_dst dst;
+
+	/* modified structure members in the end */
+	unsigned long expires;		/* precalculated expiry time */
+	struct {
+		unsigned long prev;	/* last modification */
+		u_int32_t credit;
+		u_int32_t credit_cap, cost;
+	} rateinfo;
+};
+
+struct ipt_hashlimit_htable {
+	struct hlist_node node;		/* global list of all htables */
+	atomic_t use;
+
+	struct hashlimit_cfg cfg;	/* config */
+
+	/* used internally */
+	spinlock_t lock;		/* lock for list_head */
+	u_int32_t rnd;			/* random seed for hash */
+	struct timer_list timer;	/* timer for gc */
+	atomic_t count;			/* number entries in table */
+
+	/* seq_file stuff */
+	struct proc_dir_entry *pde;
+
+	struct hlist_head hash[0];	/* hashtable itself */
+};
+
+static DECLARE_LOCK(hashlimit_lock);	/* protects htables list */
+static DECLARE_MUTEX(hlimit_mutex);	/* additional checkentry protection */
+static HLIST_HEAD(hashlimit_htables);
+static kmem_cache_t *hashlimit_cachep;
+
+static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b)
+{
+	return (ent->dst.dst_ip == b->dst_ip 
+		&& ent->dst.dst_port == b->dst_port
+		&& ent->dst.src_port == b->src_port
+		&& ent->dst.src_ip == b->src_ip);
+}
+
+static inline u_int32_t
+hash_dst(const struct ipt_hashlimit_htable *ht, const struct dsthash_dst *dst)
+{
+	return (jhash_3words(dst->dst_ip, (dst->dst_port<<16 | dst->src_port), 
+			     dst->src_ip, ht->rnd) % ht->cfg.size);
+}
+
+static inline struct dsthash_ent *
+__dsthash_find(const struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
+{
+	struct dsthash_ent *ent;
+	struct hlist_node *pos;
+	u_int32_t hash = hash_dst(ht, dst);
+
+	if (!hlist_empty(&ht->hash[hash]))
+		hlist_for_each_entry(ent, pos, &ht->hash[hash], node) {
+			if (dst_cmp(ent, dst)) {
+				return ent;
+			}
+		}
+	
+	return NULL;
+}
+
+/* allocate dsthash_ent, initialize dst, put in htable and lock it */
+static struct dsthash_ent *
+__dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst)
+{
+	struct dsthash_ent *ent;
+
+	/* initialize hash with random val at the time we allocate
+	 * the first hashtable entry */
+	if (!ht->rnd)
+		get_random_bytes(&ht->rnd, 4);
+
+	if (ht->cfg.max &&
+	    atomic_read(&ht->count) >= ht->cfg.max) {
+		/* FIXME: do something. question is what.. */
+		if (net_ratelimit())
+			printk(KERN_WARNING 
+				"ipt_hashlimit: max count of %u reached\n", 
+				ht->cfg.max);
+		return NULL;
+	}
+
+	ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+	if (!ent) {
+		if (net_ratelimit())
+			printk(KERN_ERR 
+				"ipt_hashlimit: can't allocate dsthash_ent\n");
+		return NULL;
+	}
+
+	atomic_inc(&ht->count);
+
+	ent->dst.dst_ip = dst->dst_ip;
+	ent->dst.dst_port = dst->dst_port;
+	ent->dst.src_ip = dst->src_ip;
+	ent->dst.src_port = dst->src_port;
+
+	hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+
+	return ent;
+}
+
+static inline void 
+__dsthash_free(struct ipt_hashlimit_htable *ht, struct dsthash_ent *ent)
+{
+	hlist_del(&ent->node);
+	kmem_cache_free(hashlimit_cachep, ent);
+	atomic_dec(&ht->count);
+}
+static void htable_gc(unsigned long htlong);
+
+static int htable_create(struct ipt_hashlimit_info *minfo)
+{
+	int i;
+	unsigned int size;
+	struct ipt_hashlimit_htable *hinfo;
+
+	if (minfo->cfg.size)
+		size = minfo->cfg.size;
+	else {
+		size = (((num_physpages << PAGE_SHIFT) / 16384)
+			 / sizeof(struct list_head));
+		if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
+			size = 8192;
+		if (size < 16)
+			size = 16;
+	}
+	/* FIXME: don't use vmalloc() here or anywhere else -HW */
+	hinfo = vmalloc(sizeof(struct ipt_hashlimit_htable)
+			+ (sizeof(struct list_head) * size));
+	if (!hinfo) {
+		printk(KERN_ERR "ipt_hashlimit: Unable to create hashtable\n");
+		return -1;
+	}
+	minfo->hinfo = hinfo;
+
+	/* copy match config into hashtable config */
+	memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg));
+	hinfo->cfg.size = size;
+	if (!hinfo->cfg.max)
+		hinfo->cfg.max = 8 * hinfo->cfg.size;
+	else if (hinfo->cfg.max < hinfo->cfg.size)
+		hinfo->cfg.max = hinfo->cfg.size;
+
+	for (i = 0; i < hinfo->cfg.size; i++)
+		INIT_HLIST_HEAD(&hinfo->hash[i]);
+
+	atomic_set(&hinfo->count, 0);
+	atomic_set(&hinfo->use, 1);
+	hinfo->rnd = 0;
+	spin_lock_init(&hinfo->lock);
+	hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir);
+	if (!hinfo->pde) {
+		vfree(hinfo);
+		return -1;
+	}
+	hinfo->pde->proc_fops = &dl_file_ops;
+	hinfo->pde->data = hinfo;
+
+	init_timer(&hinfo->timer);
+	hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval);
+	hinfo->timer.data = (unsigned long )hinfo;
+	hinfo->timer.function = htable_gc;
+	add_timer(&hinfo->timer);
+
+	LOCK_BH(&hashlimit_lock);
+	hlist_add_head(&hinfo->node, &hashlimit_htables);
+	UNLOCK_BH(&hashlimit_lock);
+
+	return 0;
+}
+
+static int select_all(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he)
+{
+	return 1;
+}
+
+static int select_gc(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he)
+{
+	return (jiffies >= he->expires);
+}
+
+static void htable_selective_cleanup(struct ipt_hashlimit_htable *ht,
+		 		int (*select)(struct ipt_hashlimit_htable *ht, 
+					      struct dsthash_ent *he))
+{
+	int i;
+
+	IP_NF_ASSERT(ht->cfg.size && ht->cfg.max);
+
+	/* lock hash table and iterate over it */
+	spin_lock_bh(&ht->lock);
+	for (i = 0; i < ht->cfg.size; i++) {
+		struct dsthash_ent *dh;
+		struct hlist_node *pos, *n;
+		hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) {
+			if ((*select)(ht, dh))
+				__dsthash_free(ht, dh);
+		}
+	}
+	spin_unlock_bh(&ht->lock);
+}
+
+/* hash table garbage collector, run by timer */
+static void htable_gc(unsigned long htlong)
+{
+	struct ipt_hashlimit_htable *ht = (struct ipt_hashlimit_htable *)htlong;
+
+	htable_selective_cleanup(ht, select_gc);
+
+	/* re-add the timer accordingly */
+	ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval);
+	add_timer(&ht->timer);
+}
+
+static void htable_destroy(struct ipt_hashlimit_htable *hinfo)
+{
+	/* remove timer, if it is pending */
+	if (timer_pending(&hinfo->timer))
+		del_timer(&hinfo->timer);
+
+	/* remove proc entry */
+	remove_proc_entry(hinfo->pde->name, hashlimit_procdir);
+
+	htable_selective_cleanup(hinfo, select_all);
+	vfree(hinfo);
+}
+
+static struct ipt_hashlimit_htable *htable_find_get(char *name)
+{
+	struct ipt_hashlimit_htable *hinfo;
+	struct hlist_node *pos;
+
+	LOCK_BH(&hashlimit_lock);
+	hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) {
+		if (!strcmp(name, hinfo->pde->name)) {
+			atomic_inc(&hinfo->use);
+			UNLOCK_BH(&hashlimit_lock);
+			return hinfo;
+		}
+	}
+	UNLOCK_BH(&hashlimit_lock);
+
+	return NULL;
+}
+
+static void htable_put(struct ipt_hashlimit_htable *hinfo)
+{
+	if (atomic_dec_and_test(&hinfo->use)) {
+		LOCK_BH(&hashlimit_lock);
+		hlist_del(&hinfo->node);
+		UNLOCK_BH(&hashlimit_lock);
+		htable_destroy(hinfo);
+	}
+}
+
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+   this algorithm.  The `average rate' in jiffies becomes your initial
+   amount of credit `credit' and the most credit you can ever have
+   `credit_cap'.  The `peak rate' becomes the cost of passing the
+   test, `cost'.
+
+   `prev' tracks the last packet hit: you gain one credit per jiffy.
+   If you get credit balance more than this, the extra credit is
+   discarded.  Every time the match passes, you lose `cost' credits;
+   if you don't have that many, the test fails.
+
+   See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+   To get the maximum range, we multiply by this factor (ie. you get N
+   credits per jiffy).  We want to allow a rate as low as 1 per day
+   (slowest userspace tool allows), which means
+   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie.
+*/
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+/* Precision saver. */
+static inline u_int32_t
+user2credits(u_int32_t user)
+{
+	/* If multiplying would overflow... */
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		/* Divide first. */
+		return (user / IPT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+	return (user * HZ * CREDITS_PER_JIFFY) / IPT_HASHLIMIT_SCALE;
+}
+
+static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now)
+{
+	dh->rateinfo.credit += (now - xchg(&dh->rateinfo.prev, now)) 
+					* CREDITS_PER_JIFFY;
+	if (dh->rateinfo.credit > dh->rateinfo.credit_cap)
+		dh->rateinfo.credit = dh->rateinfo.credit_cap;
+}
+
+static inline int get_ports(const struct sk_buff *skb, int offset, 
+			    u16 ports[2])
+{
+	union {
+		struct tcphdr th;
+		struct udphdr uh;
+		sctp_sctphdr_t sctph;
+	} hdr_u, *ptr_u;
+
+	/* Must not be a fragment. */
+	if (offset)
+		return 1;
+
+	/* Must be big enough to read ports (both UDP and TCP have
+	   them at the start). */
+	ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); 
+	if (!ptr_u)
+		return 1;
+
+	switch (skb->nh.iph->protocol) {
+		case IPPROTO_TCP:
+			ports[0] = ptr_u->th.source;
+			ports[1] = ptr_u->th.dest;
+			break;
+		case IPPROTO_UDP:
+			ports[0] = ptr_u->uh.source;
+			ports[1] = ptr_u->uh.dest;
+			break;
+		case IPPROTO_SCTP:
+			ports[0] = ptr_u->sctph.source;
+			ports[1] = ptr_u->sctph.dest;
+			break;
+		default:
+			/* all other protocols don't supprot per-port hash
+			 * buckets */
+			ports[0] = ports[1] = 0;
+			break;
+	}
+
+	return 0;
+}
+
+
+static int
+hashlimit_match(const struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		const void *matchinfo,
+		int offset,
+		int *hotdrop)
+{
+	struct ipt_hashlimit_info *r = 
+		((struct ipt_hashlimit_info *)matchinfo)->u.master;
+	struct ipt_hashlimit_htable *hinfo = r->hinfo;
+	unsigned long now = jiffies;
+	struct dsthash_ent *dh;
+	struct dsthash_dst dst;
+
+	/* build 'dst' according to hinfo->cfg and current packet */
+	memset(&dst, 0, sizeof(dst));
+	if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DIP)
+		dst.dst_ip = skb->nh.iph->daddr;
+	if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SIP)
+		dst.src_ip = skb->nh.iph->saddr;
+	if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT
+	    ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) {
+		u_int16_t ports[2];
+		if (get_ports(skb, offset, ports)) {
+			/* We've been asked to examine this packet, and we
+		 	  can't.  Hence, no choice but to drop. */
+			*hotdrop = 1;
+			return 0;
+		}
+		if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT)
+			dst.src_port = ports[0];
+		if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT)
+			dst.dst_port = ports[1];
+	} 
+
+	spin_lock_bh(&hinfo->lock);
+	dh = __dsthash_find(hinfo, &dst);
+	if (!dh) {
+		dh = __dsthash_alloc_init(hinfo, &dst);
+
+		if (!dh) {
+			/* enomem... don't match == DROP */
+			if (net_ratelimit())
+				printk(KERN_ERR "%s: ENOMEM\n", __FUNCTION__);
+			spin_unlock_bh(&hinfo->lock);
+			return 0;
+		}
+
+		dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
+
+		dh->rateinfo.prev = jiffies;
+		dh->rateinfo.credit = user2credits(hinfo->cfg.avg * 
+							hinfo->cfg.burst);
+		dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * 
+							hinfo->cfg.burst);
+		dh->rateinfo.cost = user2credits(hinfo->cfg.avg);
+
+		spin_unlock_bh(&hinfo->lock);
+		return 1;
+	}
+
+	/* update expiration timeout */
+	dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire);
+
+	rateinfo_recalc(dh, now);
+	if (dh->rateinfo.credit >= dh->rateinfo.cost) {
+		/* We're underlimit. */
+		dh->rateinfo.credit -= dh->rateinfo.cost;
+		spin_unlock_bh(&hinfo->lock);
+		return 1;
+	}
+
+       	spin_unlock_bh(&hinfo->lock);
+
+	/* default case: we're overlimit, thus don't match */
+	return 0;
+}
+
+static int
+hashlimit_checkentry(const char *tablename,
+		     const struct ipt_ip *ip,
+		     void *matchinfo,
+		     unsigned int matchsize,
+		     unsigned int hook_mask)
+{
+	struct ipt_hashlimit_info *r = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_hashlimit_info)))
+		return 0;
+
+	/* Check for overflow. */
+	if (r->cfg.burst == 0
+	    || user2credits(r->cfg.avg * r->cfg.burst) < 
+	    				user2credits(r->cfg.avg)) {
+		printk(KERN_ERR "ipt_hashlimit: Overflow, try lower: %u/%u\n",
+		       r->cfg.avg, r->cfg.burst);
+		return 0;
+	}
+
+	if (r->cfg.mode == 0 
+	    || r->cfg.mode > (IPT_HASHLIMIT_HASH_DPT
+		          |IPT_HASHLIMIT_HASH_DIP
+			  |IPT_HASHLIMIT_HASH_SIP
+			  |IPT_HASHLIMIT_HASH_SPT))
+		return 0;
+
+	if (!r->cfg.gc_interval)
+		return 0;
+	
+	if (!r->cfg.expire)
+		return 0;
+
+	/* This is the best we've got: We cannot release and re-grab lock,
+	 * since checkentry() is called before ip_tables.c grabs ipt_mutex.  
+	 * We also cannot grab the hashtable spinlock, since htable_create will 
+	 * call vmalloc, and that can sleep.  And we cannot just re-search
+	 * the list of htable's in htable_create(), since then we would
+	 * create duplicate proc files. -HW */
+	down(&hlimit_mutex);
+	r->hinfo = htable_find_get(r->name);
+	if (!r->hinfo && (htable_create(r) != 0)) {
+		up(&hlimit_mutex);
+		return 0;
+	}
+	up(&hlimit_mutex);
+
+	/* Ugly hack: For SMP, we only want to use one set */
+	r->u.master = r;
+
+	return 1;
+}
+
+static void
+hashlimit_destroy(void *matchinfo, unsigned int matchsize)
+{
+	struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo;
+
+	htable_put(r->hinfo);
+}
+
+static struct ipt_match ipt_hashlimit = { 
+	.name = "hashlimit", 
+	.match = hashlimit_match, 
+	.checkentry = hashlimit_checkentry, 
+	.destroy = hashlimit_destroy,
+	.me = THIS_MODULE 
+};
+
+/* PROC stuff */
+
+static void *dl_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct ipt_hashlimit_htable *htable = pde->data;
+	unsigned int *bucket;
+
+	spin_lock_bh(&htable->lock);
+	if (*pos >= htable->cfg.size)
+		return NULL;
+
+	bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC);
+	if (!bucket)
+		return ERR_PTR(-ENOMEM);
+
+	*bucket = *pos;
+	return bucket;
+}
+
+static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct ipt_hashlimit_htable *htable = pde->data;
+	unsigned int *bucket = (unsigned int *)v;
+
+	*pos = ++(*bucket);
+	if (*pos >= htable->cfg.size) {
+		kfree(v);
+		return NULL;
+	}
+	return bucket;
+}
+
+static void dl_seq_stop(struct seq_file *s, void *v)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct ipt_hashlimit_htable *htable = pde->data;
+	unsigned int *bucket = (unsigned int *)v;
+
+	kfree(bucket);
+
+	spin_unlock_bh(&htable->lock);
+}
+
+static inline int dl_seq_real_show(struct dsthash_ent *ent, struct seq_file *s)
+{
+	/* recalculate to show accurate numbers */
+	rateinfo_recalc(ent, jiffies);
+
+	return seq_printf(s, "%ld %u.%u.%u.%u:%u->%u.%u.%u.%u:%u %u %u %u\n",
+			(long)(ent->expires - jiffies)/HZ,
+			NIPQUAD(ent->dst.src_ip), ntohs(ent->dst.src_port),
+			NIPQUAD(ent->dst.dst_ip), ntohs(ent->dst.dst_port),
+			ent->rateinfo.credit, ent->rateinfo.credit_cap,
+			ent->rateinfo.cost);
+}
+
+static int dl_seq_show(struct seq_file *s, void *v)
+{
+	struct proc_dir_entry *pde = s->private;
+	struct ipt_hashlimit_htable *htable = pde->data;
+	unsigned int *bucket = (unsigned int *)v;
+	struct dsthash_ent *ent;
+	struct hlist_node *pos;
+
+	if (!hlist_empty(&htable->hash[*bucket]))
+		hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) {
+			if (dl_seq_real_show(ent, s)) {
+				/* buffer was filled and unable to print that tuple */
+				return 1;
+			}
+		}
+	
+	return 0;
+}
+
+static struct seq_operations dl_seq_ops = {
+	.start = dl_seq_start,
+	.next  = dl_seq_next,
+	.stop  = dl_seq_stop,
+	.show  = dl_seq_show
+};
+
+static int dl_proc_open(struct inode *inode, struct file *file)
+{
+	int ret = seq_open(file, &dl_seq_ops);
+
+	if (!ret) {
+		struct seq_file *sf = file->private_data;
+		sf->private = PDE(inode);
+	}
+	return ret;
+}
+
+static struct file_operations dl_file_ops = {
+	.owner   = THIS_MODULE,
+	.open    = dl_proc_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release
+};
+
+static int init_or_fini(int fini)
+{
+	int ret = 0;
+
+	if (fini)
+		goto cleanup;
+
+	if (ipt_register_match(&ipt_hashlimit)) {
+		ret = -EINVAL;
+		goto cleanup_nothing;
+	}
+
+	hashlimit_cachep = kmem_cache_create("ipt_hashlimit",
+					    sizeof(struct dsthash_ent), 0,
+					    0, NULL, NULL);
+	if (!hashlimit_cachep) {
+		printk(KERN_ERR "Unable to create ipt_hashlimit slab cache\n");
+		ret = -ENOMEM;
+		goto cleanup_unreg_match;
+	}
+
+	hashlimit_procdir = proc_mkdir("ipt_hashlimit", proc_net);
+	if (!hashlimit_procdir) {
+		printk(KERN_ERR "Unable to create proc dir entry\n");
+		ret = -ENOMEM;
+		goto cleanup_free_slab;
+	}
+
+	return ret;
+
+cleanup:
+	remove_proc_entry("ipt_hashlimit", proc_net);
+cleanup_free_slab:
+	kmem_cache_destroy(hashlimit_cachep);
+cleanup_unreg_match:
+	ipt_unregister_match(&ipt_hashlimit);
+cleanup_nothing:
+	return ret;
+	
+}
+
+static int __init init(void)
+{
+	return init_or_fini(0);
+}
+
+static void __exit fini(void)
+{
+	init_or_fini(1);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c
new file mode 100644
index 000000000000..33fdf364d3d3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_helper.c
@@ -0,0 +1,113 @@
+/* iptables module to match on related connections */
+/*
+ * (C) 2001 Martin Josefsson <gandalf@wlug.westbo.se>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *   19 Mar 2002 Harald Welte <laforge@gnumonks.org>:
+ *   		 - Port to newnat infrastructure
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_conntrack_core.h>
+#include <linux/netfilter_ipv4/ip_conntrack_helper.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_helper.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Josefsson <gandalf@netfilter.org>");
+MODULE_DESCRIPTION("iptables helper match module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_helper_info *info = matchinfo;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	int ret = info->invert;
+	
+	ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo);
+	if (!ct) {
+		DEBUGP("ipt_helper: Eek! invalid conntrack?\n");
+		return ret;
+	}
+
+	if (!ct->master) {
+		DEBUGP("ipt_helper: conntrack %p has no master\n", ct);
+		return ret;
+	}
+
+	READ_LOCK(&ip_conntrack_lock);
+	if (!ct->master->helper) {
+		DEBUGP("ipt_helper: master ct %p has no helper\n", 
+			exp->expectant);
+		goto out_unlock;
+	}
+
+	DEBUGP("master's name = %s , info->name = %s\n", 
+		ct->master->helper->name, info->name);
+
+	if (info->name[0] == '\0')
+		ret ^= 1;
+	else
+		ret ^= !strncmp(ct->master->helper->name, info->name, 
+		                strlen(ct->master->helper->name));
+out_unlock:
+	READ_UNLOCK(&ip_conntrack_lock);
+	return ret;
+}
+
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	struct ipt_helper_info *info = matchinfo;
+
+	info->name[29] = '\0';
+
+	/* verify size */
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match helper_match = {
+	.name		= "helper",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	need_ip_conntrack();
+	return ipt_register_match(&helper_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&helper_match);
+}
+
+module_init(init);
+module_exit(fini);
+
diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c
new file mode 100644
index 000000000000..b835b7b2e560
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_iprange.c
@@ -0,0 +1,99 @@
+/*
+ * iptables module to match IP address ranges
+ *
+ * (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_iprange.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>");
+MODULE_DESCRIPTION("iptables arbitrary IP range match module");
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset, int *hotdrop)
+{
+	const struct ipt_iprange_info *info = matchinfo;
+	const struct iphdr *iph = skb->nh.iph;
+
+	if (info->flags & IPRANGE_SRC) {
+		if (((ntohl(iph->saddr) < ntohl(info->src.min_ip))
+			  || (ntohl(iph->saddr) > ntohl(info->src.max_ip)))
+			 ^ !!(info->flags & IPRANGE_SRC_INV)) {
+			DEBUGP("src IP %u.%u.%u.%u NOT in range %s"
+			       "%u.%u.%u.%u-%u.%u.%u.%u\n",
+				NIPQUAD(iph->saddr),
+			        info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
+				NIPQUAD(info->src.min_ip),
+				NIPQUAD(info->src.max_ip));
+			return 0;
+		}
+	}
+	if (info->flags & IPRANGE_DST) {
+		if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip))
+			  || (ntohl(iph->daddr) > ntohl(info->dst.max_ip)))
+			 ^ !!(info->flags & IPRANGE_DST_INV)) {
+			DEBUGP("dst IP %u.%u.%u.%u NOT in range %s"
+			       "%u.%u.%u.%u-%u.%u.%u.%u\n",
+				NIPQUAD(iph->daddr),
+			        info->flags & IPRANGE_DST_INV ? "(INV) " : "",
+				NIPQUAD(info->dst.min_ip),
+				NIPQUAD(info->dst.max_ip));
+			return 0;
+		}
+	}
+	return 1;
+}
+
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	/* verify size */
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_iprange_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match iprange_match = 
+{ 
+	.list = { NULL, NULL }, 
+	.name = "iprange", 
+	.match = &match, 
+	.checkentry = &check, 
+	.destroy = NULL, 
+	.me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&iprange_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&iprange_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c
new file mode 100644
index 000000000000..4eabcfbda9d1
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_length.c
@@ -0,0 +1,64 @@
+/* Kernel module to match packet length. */
+/* (C) 1999-2001 James Morris <jmorros@intercode.com.au>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_length.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
+MODULE_DESCRIPTION("IP tables packet length matching module");
+MODULE_LICENSE("GPL");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_length_info *info = matchinfo;
+	u_int16_t pktlen = ntohs(skb->nh.iph->tot_len);
+	
+	return (pktlen >= info->min && pktlen <= info->max) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match length_match = {
+	.name		= "length",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&length_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&length_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c
new file mode 100644
index 000000000000..0c24dcc703a5
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_limit.c
@@ -0,0 +1,157 @@
+/* Kernel module to control the rate
+ *
+ * 2 September 1999: Changed from the target RATE to the match
+ *                   `limit', removed logging.  Did I mention that
+ *                   Alexey is a fucking genius?
+ *                   Rusty Russell (rusty@rustcorp.com.au).  */
+
+/* (C) 1999 J�r�me de Vivie <devivie@info.enserb.u-bordeaux.fr>
+ * (C) 1999 Herv� Eychenne <eychenne@info.enserb.u-bordeaux.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_limit.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
+MODULE_DESCRIPTION("iptables rate limit match");
+
+/* The algorithm used is the Simple Token Bucket Filter (TBF)
+ * see net/sched/sch_tbf.c in the linux source tree
+ */
+
+static DEFINE_SPINLOCK(limit_lock);
+
+/* Rusty: This is my (non-mathematically-inclined) understanding of
+   this algorithm.  The `average rate' in jiffies becomes your initial
+   amount of credit `credit' and the most credit you can ever have
+   `credit_cap'.  The `peak rate' becomes the cost of passing the
+   test, `cost'.
+
+   `prev' tracks the last packet hit: you gain one credit per jiffy.
+   If you get credit balance more than this, the extra credit is
+   discarded.  Every time the match passes, you lose `cost' credits;
+   if you don't have that many, the test fails.
+
+   See Alexey's formal explanation in net/sched/sch_tbf.c.
+
+   To get the maxmum range, we multiply by this factor (ie. you get N
+   credits per jiffy).  We want to allow a rate as low as 1 per day
+   (slowest userspace tool allows), which means
+   CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */
+#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24))
+
+/* Repeated shift and or gives us all 1s, final shift and add 1 gives
+ * us the power of 2 below the theoretical max, so GCC simply does a
+ * shift. */
+#define _POW2_BELOW2(x) ((x)|((x)>>1))
+#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
+#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
+#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
+#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
+#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
+
+#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)
+
+static int
+ipt_limit_match(const struct sk_buff *skb,
+		const struct net_device *in,
+		const struct net_device *out,
+		const void *matchinfo,
+		int offset,
+		int *hotdrop)
+{
+	struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master;
+	unsigned long now = jiffies;
+
+	spin_lock_bh(&limit_lock);
+	r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
+	if (r->credit > r->credit_cap)
+		r->credit = r->credit_cap;
+
+	if (r->credit >= r->cost) {
+		/* We're not limited. */
+		r->credit -= r->cost;
+		spin_unlock_bh(&limit_lock);
+		return 1;
+	}
+
+       	spin_unlock_bh(&limit_lock);
+	return 0;
+}
+
+/* Precision saver. */
+static u_int32_t
+user2credits(u_int32_t user)
+{
+	/* If multiplying would overflow... */
+	if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY))
+		/* Divide first. */
+		return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
+
+	return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE;
+}
+
+static int
+ipt_limit_checkentry(const char *tablename,
+		     const struct ipt_ip *ip,
+		     void *matchinfo,
+		     unsigned int matchsize,
+		     unsigned int hook_mask)
+{
+	struct ipt_rateinfo *r = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo)))
+		return 0;
+
+	/* Check for overflow. */
+	if (r->burst == 0
+	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
+		printk("Overflow in ipt_limit, try lower: %u/%u\n",
+		       r->avg, r->burst);
+		return 0;
+	}
+
+	/* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies *
+	   128. */
+	r->prev = jiffies;
+	r->credit = user2credits(r->avg * r->burst);	 /* Credits full. */
+	r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
+	r->cost = user2credits(r->avg);
+
+	/* For SMP, we only want to use one set of counters. */
+	r->master = r;
+
+	return 1;
+}
+
+static struct ipt_match ipt_limit_reg = {
+	.name		= "limit",
+	.match		= ipt_limit_match,
+	.checkentry	= ipt_limit_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	if (ipt_register_match(&ipt_limit_reg))
+		return -EINVAL;
+	return 0;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&ipt_limit_reg);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c
new file mode 100644
index 000000000000..11a459e33f25
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mac.c
@@ -0,0 +1,79 @@
+/* Kernel module to match MAC address parameters. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+
+#include <linux/netfilter_ipv4/ipt_mac.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mac matching module");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+    const struct ipt_mac_info *info = matchinfo;
+
+    /* Is mac pointer valid? */
+    return (skb->mac.raw >= skb->head
+	    && (skb->mac.raw + ETH_HLEN) <= skb->data
+	    /* If so, compare... */
+	    && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN)
+		== 0) ^ info->invert));
+}
+
+static int
+ipt_mac_checkentry(const char *tablename,
+		   const struct ipt_ip *ip,
+		   void *matchinfo,
+		   unsigned int matchsize,
+		   unsigned int hook_mask)
+{
+	/* FORWARD isn't always valid, but it's nice to be able to do --RR */
+	if (hook_mask
+	    & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
+		| (1 << NF_IP_FORWARD))) {
+		printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
+		return 0;
+	}
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match mac_match = {
+	.name		= "mac",
+	.match		= &match,
+	.checkentry	= &ipt_mac_checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&mac_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&mac_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c
new file mode 100644
index 000000000000..8955728127b9
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_mark.c
@@ -0,0 +1,64 @@
+/* Kernel module to match NFMARK values. */
+
+/* (C) 1999-2001 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_mark.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables mark matching module");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_mark_info *info = matchinfo;
+
+	return ((skb->nfmark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match mark_match = {
+	.name		= "mark",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&mark_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&mark_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c
new file mode 100644
index 000000000000..99e8188162e2
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_multiport.c
@@ -0,0 +1,212 @@
+/* Kernel module to match one of a list of TCP/UDP ports: ports are in
+   the same place so we can treat them as equal. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/udp.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_multiport.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables multiple port match module");
+
+#if 0
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+/* Returns 1 if the port is matched by the test, 0 otherwise. */
+static inline int
+ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags,
+	    u_int8_t count, u_int16_t src, u_int16_t dst)
+{
+	unsigned int i;
+	for (i=0; i<count; i++) {
+		if (flags != IPT_MULTIPORT_DESTINATION
+		    && portlist[i] == src)
+			return 1;
+
+		if (flags != IPT_MULTIPORT_SOURCE
+		    && portlist[i] == dst)
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Returns 1 if the port is matched by the test, 0 otherwise. */
+static inline int
+ports_match_v1(const struct ipt_multiport_v1 *minfo,
+	       u_int16_t src, u_int16_t dst)
+{
+	unsigned int i;
+	u_int16_t s, e;
+
+	for (i=0; i < minfo->count; i++) {
+		s = minfo->ports[i];
+
+		if (minfo->pflags[i]) {
+			/* range port matching */
+			e = minfo->ports[++i];
+			duprintf("src or dst matches with %d-%d?\n", s, e);
+
+			if (minfo->flags == IPT_MULTIPORT_SOURCE
+			    && src >= s && src <= e)
+				return 1 ^ minfo->invert;
+			if (minfo->flags == IPT_MULTIPORT_DESTINATION
+			    && dst >= s && dst <= e)
+				return 1 ^ minfo->invert;
+			if (minfo->flags == IPT_MULTIPORT_EITHER
+			    && ((dst >= s && dst <= e)
+				|| (src >= s && src <= e)))
+				return 1 ^ minfo->invert;
+		} else {
+			/* exact port matching */
+			duprintf("src or dst matches with %d?\n", s);
+
+			if (minfo->flags == IPT_MULTIPORT_SOURCE
+			    && src == s)
+				return 1 ^ minfo->invert;
+			if (minfo->flags == IPT_MULTIPORT_DESTINATION
+			    && dst == s)
+				return 1 ^ minfo->invert;
+			if (minfo->flags == IPT_MULTIPORT_EITHER
+			    && (src == s || dst == s))
+				return 1 ^ minfo->invert;
+		}
+	}
+ 
+ 	return minfo->invert;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	u16 _ports[2], *pptr;
+	const struct ipt_multiport *multiinfo = matchinfo;
+
+	if (offset)
+		return 0;
+
+	pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				  sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("ipt_multiport:"
+			 " Dropping evil offset=0 tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return ports_match(multiinfo->ports,
+			   multiinfo->flags, multiinfo->count,
+			   ntohs(pptr[0]), ntohs(pptr[1]));
+}
+
+static int
+match_v1(const struct sk_buff *skb,
+	 const struct net_device *in,
+	 const struct net_device *out,
+	 const void *matchinfo,
+	 int offset,
+	 int *hotdrop)
+{
+	u16 _ports[2], *pptr;
+	const struct ipt_multiport_v1 *multiinfo = matchinfo;
+
+	if (offset)
+		return 0;
+
+	pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				  sizeof(_ports), _ports);
+	if (pptr == NULL) {
+		/* We've been asked to examine this packet, and we
+		 * can't.  Hence, no choice but to drop.
+		 */
+		duprintf("ipt_multiport:"
+			 " Dropping evil offset=0 tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+	}
+
+	return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1]));
+}
+
+/* Called when user tries to insert an entry of this type. */
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport)));
+}
+
+static int
+checkentry_v1(const char *tablename,
+	      const struct ipt_ip *ip,
+	      void *matchinfo,
+	      unsigned int matchsize,
+	      unsigned int hook_mask)
+{
+	return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1)));
+}
+
+static struct ipt_match multiport_match = {
+	.name		= "multiport",
+	.revision	= 0,
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static struct ipt_match multiport_match_v1 = {
+	.name		= "multiport",
+	.revision	= 1,
+	.match		= &match_v1,
+	.checkentry	= &checkentry_v1,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	int err;
+
+	err = ipt_register_match(&multiport_match);
+	if (!err) {
+		err = ipt_register_match(&multiport_match_v1);
+		if (err)
+			ipt_unregister_match(&multiport_match);
+	}
+
+	return err;
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&multiport_match);
+	ipt_unregister_match(&multiport_match_v1);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
new file mode 100644
index 000000000000..3b9065e06381
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -0,0 +1,217 @@
+/* Kernel module to match various things tied to sockets associated with
+   locally generated outgoing packets. */
+
+/* (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+
+#include <linux/netfilter_ipv4/ipt_owner.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables owner match");
+
+static int
+match_comm(const struct sk_buff *skb, const char *comm)
+{
+	struct task_struct *g, *p;
+	struct files_struct *files;
+	int i;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		if(strncmp(p->comm, comm, sizeof(p->comm)))
+			continue;
+
+		task_lock(p);
+		files = p->files;
+		if(files) {
+			spin_lock(&files->file_lock);
+			for (i=0; i < files->max_fds; i++) {
+				if (fcheck_files(files, i) ==
+				    skb->sk->sk_socket->file) {
+					spin_unlock(&files->file_lock);
+					task_unlock(p);
+					read_unlock(&tasklist_lock);
+					return 1;
+				}
+			}
+			spin_unlock(&files->file_lock);
+		}
+		task_unlock(p);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+
+static int
+match_pid(const struct sk_buff *skb, pid_t pid)
+{
+	struct task_struct *p;
+	struct files_struct *files;
+	int i;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p)
+		goto out;
+	task_lock(p);
+	files = p->files;
+	if(files) {
+		spin_lock(&files->file_lock);
+		for (i=0; i < files->max_fds; i++) {
+			if (fcheck_files(files, i) ==
+			    skb->sk->sk_socket->file) {
+				spin_unlock(&files->file_lock);
+				task_unlock(p);
+				read_unlock(&tasklist_lock);
+				return 1;
+			}
+		}
+		spin_unlock(&files->file_lock);
+	}
+	task_unlock(p);
+out:
+	read_unlock(&tasklist_lock);
+	return 0;
+}
+
+static int
+match_sid(const struct sk_buff *skb, pid_t sid)
+{
+	struct task_struct *g, *p;
+	struct file *file = skb->sk->sk_socket->file;
+	int i, found=0;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		struct files_struct *files;
+		if (p->signal->session != sid)
+			continue;
+
+		task_lock(p);
+		files = p->files;
+		if (files) {
+			spin_lock(&files->file_lock);
+			for (i=0; i < files->max_fds; i++) {
+				if (fcheck_files(files, i) == file) {
+					found = 1;
+					break;
+				}
+			}
+			spin_unlock(&files->file_lock);
+		}
+		task_unlock(p);
+		if (found)
+			goto out;
+	} while_each_thread(g, p);
+out:
+	read_unlock(&tasklist_lock);
+
+	return found;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_owner_info *info = matchinfo;
+
+	if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file)
+		return 0;
+
+	if(info->match & IPT_OWNER_UID) {
+		if ((skb->sk->sk_socket->file->f_uid != info->uid) ^
+		    !!(info->invert & IPT_OWNER_UID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_GID) {
+		if ((skb->sk->sk_socket->file->f_gid != info->gid) ^
+		    !!(info->invert & IPT_OWNER_GID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_PID) {
+		if (!match_pid(skb, info->pid) ^
+		    !!(info->invert & IPT_OWNER_PID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_SID) {
+		if (!match_sid(skb, info->sid) ^
+		    !!(info->invert & IPT_OWNER_SID))
+			return 0;
+	}
+
+	if(info->match & IPT_OWNER_COMM) {
+		if (!match_comm(skb, info->comm) ^
+		    !!(info->invert & IPT_OWNER_COMM))
+			return 0;
+	}
+
+	return 1;
+}
+
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+        if (hook_mask
+            & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) {
+                printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n");
+                return 0;
+        }
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info))) {
+		printk("Matchsize %u != %Zu\n", matchsize,
+		       IPT_ALIGN(sizeof(struct ipt_owner_info)));
+		return 0;
+	}
+#ifdef CONFIG_SMP
+	/* files->file_lock can not be used in a BH */
+	if (((struct ipt_owner_info *)matchinfo)->match
+	    & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) {
+		printk("ipt_owner: pid, sid and command matching is broken "
+		       "on SMP.\n");
+		return 0;
+	}
+#endif
+	return 1;
+}
+
+static struct ipt_match owner_match = {
+	.name		= "owner",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&owner_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&owner_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c
new file mode 100644
index 000000000000..1a53924041fc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_physdev.c
@@ -0,0 +1,134 @@
+/* Kernel module to match the bridge port in and
+ * out device for IP packets coming into contact with a bridge. */
+
+/* (C) 2001-2003 Bart De Schuymer <bdschuym@pandora.be>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ipt_physdev.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_bridge.h>
+#define MATCH   1
+#define NOMATCH 0
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Bart De Schuymer <bdschuym@pandora.be>");
+MODULE_DESCRIPTION("iptables bridge physical device match module");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	int i;
+	static const char nulldevname[IFNAMSIZ];
+	const struct ipt_physdev_info *info = matchinfo;
+	unsigned int ret;
+	const char *indev, *outdev;
+	struct nf_bridge_info *nf_bridge;
+
+	/* Not a bridged IP packet or no info available yet:
+	 * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if
+	 * the destination device will be a bridge. */
+	if (!(nf_bridge = skb->nf_bridge)) {
+		/* Return MATCH if the invert flags of the used options are on */
+		if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
+		    !(info->invert & IPT_PHYSDEV_OP_BRIDGED))
+			return NOMATCH;
+		if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) &&
+		    !(info->invert & IPT_PHYSDEV_OP_ISIN))
+			return NOMATCH;
+		if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) &&
+		    !(info->invert & IPT_PHYSDEV_OP_ISOUT))
+			return NOMATCH;
+		if ((info->bitmask & IPT_PHYSDEV_OP_IN) &&
+		    !(info->invert & IPT_PHYSDEV_OP_IN))
+			return NOMATCH;
+		if ((info->bitmask & IPT_PHYSDEV_OP_OUT) &&
+		    !(info->invert & IPT_PHYSDEV_OP_OUT))
+			return NOMATCH;
+		return MATCH;
+	}
+
+	/* This only makes sense in the FORWARD and POSTROUTING chains */
+	if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) &&
+	    (!!(nf_bridge->mask & BRNF_BRIDGED) ^
+	    !(info->invert & IPT_PHYSDEV_OP_BRIDGED)))
+		return NOMATCH;
+
+	if ((info->bitmask & IPT_PHYSDEV_OP_ISIN &&
+	    (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) ||
+	    (info->bitmask & IPT_PHYSDEV_OP_ISOUT &&
+	    (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT))))
+		return NOMATCH;
+
+	if (!(info->bitmask & IPT_PHYSDEV_OP_IN))
+		goto match_outdev;
+	indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname;
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+		ret |= (((const unsigned int *)indev)[i]
+			^ ((const unsigned int *)info->physindev)[i])
+			& ((const unsigned int *)info->in_mask)[i];
+	}
+
+	if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN))
+		return NOMATCH;
+
+match_outdev:
+	if (!(info->bitmask & IPT_PHYSDEV_OP_OUT))
+		return MATCH;
+	outdev = nf_bridge->physoutdev ?
+		 nf_bridge->physoutdev->name : nulldevname;
+	for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) {
+		ret |= (((const unsigned int *)outdev)[i]
+			^ ((const unsigned int *)info->physoutdev)[i])
+			& ((const unsigned int *)info->out_mask)[i];
+	}
+
+	return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT);
+}
+
+static int
+checkentry(const char *tablename,
+		       const struct ipt_ip *ip,
+		       void *matchinfo,
+		       unsigned int matchsize,
+		       unsigned int hook_mask)
+{
+	const struct ipt_physdev_info *info = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info)))
+		return 0;
+	if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) ||
+	    info->bitmask & ~IPT_PHYSDEV_OP_MASK)
+		return 0;
+	return 1;
+}
+
+static struct ipt_match physdev_match = {
+	.name		= "physdev",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&physdev_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&physdev_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/ipv4/netfilter/ipt_pkttype.c
new file mode 100644
index 000000000000..8ddb1dc5e5ae
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_pkttype.c
@@ -0,0 +1,70 @@
+/* (C) 1999-2001 Michal Ludvig <michal@logix.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/if_ether.h>
+#include <linux/if_packet.h>
+
+#include <linux/netfilter_ipv4/ipt_pkttype.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Michal Ludvig <michal@logix.cz>");
+MODULE_DESCRIPTION("IP tables match to match on linklayer packet type");
+
+static int match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+    const struct ipt_pkttype_info *info = matchinfo;
+
+    return (skb->pkt_type == info->pkttype) ^ info->invert;
+}
+
+static int checkentry(const char *tablename,
+		   const struct ipt_ip *ip,
+		   void *matchinfo,
+		   unsigned int matchsize,
+		   unsigned int hook_mask)
+{
+/*
+	if (hook_mask
+	    & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN)
+		| (1 << NF_IP_FORWARD))) {
+		printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n");
+		return 0;
+	}
+*/
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_pkttype_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match pkttype_match = {
+	.name		= "pkttype",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&pkttype_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&pkttype_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/ipv4/netfilter/ipt_realm.c
new file mode 100644
index 000000000000..54a6897ebaa6
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_realm.c
@@ -0,0 +1,76 @@
+/* IP tables module for matching the routing realm
+ *
+ * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $
+ *
+ * (C) 2003 by Sampsa Ranta <sampsa@netsonic.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <net/route.h>
+
+#include <linux/netfilter_ipv4/ipt_realm.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("Sampsa Ranta <sampsa@netsonic.fi>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables realm match");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_realm_info *info = matchinfo;
+	struct dst_entry *dst = skb->dst;
+    
+	return (info->id == (dst->tclassid & info->mask)) ^ info->invert;
+}
+
+static int check(const char *tablename,
+                 const struct ipt_ip *ip,
+                 void *matchinfo,
+                 unsigned int matchsize,
+                 unsigned int hook_mask)
+{
+	if (hook_mask
+	    & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) |
+	        (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) {
+		printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, "
+		       "LOCAL_IN or FORWARD.\n");
+		return 0;
+	}
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_realm_info))) {
+		printk("ipt_realm: invalid matchsize.\n");
+		return 0;
+	}
+	return 1;
+}
+
+static struct ipt_match realm_match = {
+	.name		= "realm",
+	.match		= match, 
+	.checkentry	= check,
+	.me		= THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&realm_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&realm_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c
new file mode 100644
index 000000000000..25ab9fabdcba
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_recent.c
@@ -0,0 +1,1002 @@
+/* Kernel module to check if the source address has been seen recently. */
+/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */
+/* Author: Stephen Frost <sfrost@snowman.net> */
+/* Project Page: http://snowman.net/projects/ipt_recent/ */
+/* This software is distributed under the terms of the GPL, Version 2 */
+/* This copyright does not cover user programs that use kernel services
+ * by normal system calls. */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <linux/ctype.h>
+#include <linux/ip.h>
+#include <linux/vmalloc.h>
+#include <linux/moduleparam.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_recent.h>
+
+#undef DEBUG
+#define HASH_LOG 9
+
+/* Defaults, these can be overridden on the module command-line. */
+static int ip_list_tot = 100;
+static int ip_pkt_list_tot = 20;
+static int ip_list_hash_size = 0;
+static int ip_list_perms = 0644;
+#ifdef DEBUG
+static int debug = 1;
+#endif
+
+static char version[] =
+KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>.  http://snowman.net/projects/ipt_recent/\n";
+
+MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>");
+MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER);
+MODULE_LICENSE("GPL");
+module_param(ip_list_tot, int, 0400);
+module_param(ip_pkt_list_tot, int, 0400);
+module_param(ip_list_hash_size, int, 0400);
+module_param(ip_list_perms, int, 0400);
+#ifdef DEBUG
+module_param(debug, int, 0600);
+MODULE_PARM_DESC(debug,"debugging level, defaults to 1");
+#endif
+MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list");
+MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember");
+MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs");
+MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files");
+
+/* Structure of our list of recently seen addresses. */
+struct recent_ip_list {
+	u_int32_t addr;
+	u_int8_t  ttl;
+	unsigned long last_seen;
+	unsigned long *last_pkts;
+	u_int32_t oldest_pkt;
+	u_int32_t hash_entry;
+	u_int32_t time_pos;
+};
+
+struct time_info_list {
+	u_int32_t position;
+	u_int32_t time;
+};
+
+/* Structure of our linked list of tables of recent lists. */
+struct recent_ip_tables {
+	char name[IPT_RECENT_NAME_LEN];
+	int count;
+	int time_pos;
+	struct recent_ip_list *table;
+	struct recent_ip_tables *next;
+	spinlock_t list_lock;
+	int *hash_table;
+	struct time_info_list *time_info;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *status_proc;
+#endif /* CONFIG_PROC_FS */
+};
+
+/* Our current list of addresses we have recently seen.
+ * Only added to on a --set, and only updated on --set || --update 
+ */
+static struct recent_ip_tables *r_tables = NULL;
+
+/* We protect r_list with this spinlock so two processors are not modifying
+ * the list at the same time. 
+ */
+static DEFINE_SPINLOCK(recent_lock);
+
+#ifdef CONFIG_PROC_FS
+/* Our /proc/net/ipt_recent entry */
+static struct proc_dir_entry *proc_net_ipt_recent = NULL;
+#endif
+
+/* Function declaration for later. */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop);
+
+/* Function to hash a given address into the hash table of table_size size */
+static int hash_func(unsigned int addr, int table_size)
+{
+	int result = 0;
+	unsigned int value = addr;
+	do { result ^= value; } while((value >>= HASH_LOG));
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n",
+			 result & (table_size - 1),
+			 addr,
+			 table_size);
+#endif
+
+	return(result & (table_size - 1));
+}
+
+#ifdef CONFIG_PROC_FS
+/* This is the function which produces the output for our /proc output
+ * interface which lists each IP address, the last seen time and the 
+ * other recent times the address was seen.
+ */
+
+static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data)
+{
+	int len = 0, count, last_len = 0, pkt_count;
+	off_t pos = 0;
+	off_t begin = 0;
+	struct recent_ip_tables *curr_table;
+
+	curr_table = (struct recent_ip_tables*) data;
+
+	spin_lock_bh(&curr_table->list_lock);
+	for(count = 0; count < ip_list_tot; count++) {
+		if(!curr_table->table[count].addr) continue;
+		last_len = len;
+		len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr));
+		len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl);
+		len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen);
+		len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt);
+		len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]);
+		for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) {
+			if(!curr_table->table[count].last_pkts[pkt_count]) break;
+			len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]);
+		}
+		len += sprintf(buffer+len,"\n");
+		pos = begin + len;
+		if(pos < offset) { len = 0; begin = pos; }
+		if(pos > offset + length) { len = last_len; break; }
+	}
+
+	*start = buffer + (offset - begin);
+	len -= (offset - begin);
+	if(len > length) len = length;
+
+	spin_unlock_bh(&curr_table->list_lock);
+	return len;
+}
+
+/* ip_recent_ctrl provides an interface for users to modify the table
+ * directly.  This allows adding entries, removing entries, and
+ * flushing the entire table.
+ * This is done by opening up the appropriate table for writing and
+ * sending one of:
+ * xx.xx.xx.xx   -- Add entry to table with current time
+ * +xx.xx.xx.xx  -- Add entry to table with current time
+ * -xx.xx.xx.xx  -- Remove entry from table
+ * clear         -- Flush table, remove all entries
+ */
+
+static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data)
+{
+	static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff };
+	u_int32_t val;
+	int base, used = 0;
+	char c, *cp;
+	union iaddr {
+		uint8_t bytes[4];
+		uint32_t word;
+	} res;
+	uint8_t *pp = res.bytes;
+	int digit;
+
+	char buffer[20];
+	int len, check_set = 0, count;
+	u_int32_t addr = 0;
+	struct sk_buff *skb;
+	struct ipt_recent_info *info;
+	struct recent_ip_tables *curr_table;
+
+	curr_table = (struct recent_ip_tables*) data;
+
+	if(size > 20) len = 20; else len = size;
+
+	if(copy_from_user(buffer,input,len)) return -EFAULT;
+
+	if(len < 20) buffer[len] = '\0';
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer);
+#endif
+
+	cp = buffer;
+	while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; }
+
+	/* Check if we are asked to flush the entire table */
+	if(!memcmp(cp,"clear",5)) {
+		used += 5;
+		spin_lock_bh(&curr_table->list_lock);
+		curr_table->time_pos = 0;
+		for(count = 0; count < ip_list_hash_size; count++) {
+			curr_table->hash_table[count] = -1;
+		}
+		for(count = 0; count < ip_list_tot; count++) {
+			curr_table->table[count].last_seen = 0;
+			curr_table->table[count].addr = 0;
+			curr_table->table[count].ttl = 0;
+			memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+			curr_table->table[count].oldest_pkt = 0;
+			curr_table->table[count].time_pos = 0;
+			curr_table->time_info[count].position = count;
+			curr_table->time_info[count].time = 0;
+		}
+		spin_unlock_bh(&curr_table->list_lock);
+		return used;
+	}
+
+        check_set = IPT_RECENT_SET;
+	switch(*cp) {
+		case '+': check_set = IPT_RECENT_SET; cp++; used++; break;
+		case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break;
+		default: if(!isdigit(*cp)) return (used+1); break;
+	}
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set);
+#endif
+	/* Get addr (effectively inet_aton()) */
+	/* Shamelessly stolen from libc, a function in the kernel for doing
+	 * this would, of course, be greatly preferred, but our options appear
+	 * to be rather limited, so we will just do it ourselves here.
+	 */
+	res.word = 0;
+
+	c = *cp;
+	for(;;) {
+		if(!isdigit(c)) return used;
+		val = 0; base = 10; digit = 0;
+		if(c == '0') {
+			c = *++cp;
+			if(c == 'x' || c == 'X') base = 16, c = *++cp;
+			else { base = 8; digit = 1; }
+		}
+		for(;;) {
+			if(isascii(c) && isdigit(c)) {
+				if(base == 8 && (c == '8' || c == '0')) return used;
+				val = (val * base) + (c - '0');
+				c = *++cp;
+				digit = 1;
+			} else if(base == 16 && isascii(c) && isxdigit(c)) {
+				val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A'));
+				c = *++cp;
+				digit = 1;
+			} else break;
+		}
+		if(c == '.') {
+			if(pp > res.bytes + 2 || val > 0xff) return used;
+			*pp++ = val;
+			c = *++cp;
+		} else break;
+	}
+	used = cp - buffer;
+	if(c != '\0' && (!isascii(c) || !isspace(c))) return used;
+	if(c == '\n') used++;
+	if(!digit) return used;
+
+	if(val > max[pp - res.bytes]) return used;
+	addr = res.word | htonl(val);
+
+	if(!addr && check_set == IPT_RECENT_SET) return used;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used);
+#endif
+
+	/* Set up and just call match */
+	info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL);
+	if(!info) { return -ENOMEM; }
+	info->seconds = 0;
+	info->hit_count = 0;
+	info->check_set = check_set;
+	info->invert = 0;
+	info->side = IPT_RECENT_SOURCE;
+	strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN);
+	info->name[IPT_RECENT_NAME_LEN-1] = '\0';
+
+	skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL);
+	if (!skb) {
+		used = -ENOMEM;
+		goto out_free_info;
+	}
+	skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL);
+	if (!skb->nh.iph) {
+		used = -ENOMEM;
+		goto out_free_skb;
+	}
+
+	skb->nh.iph->saddr = addr;
+	skb->nh.iph->daddr = 0;
+	/* Clear ttl since we have no way of knowing it */
+	skb->nh.iph->ttl = 0;
+	match(skb,NULL,NULL,info,0,NULL);
+
+	kfree(skb->nh.iph);
+out_free_skb:
+	kfree(skb);
+out_free_info:
+	kfree(info);
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used);
+#endif
+	return used;
+}
+
+#endif /* CONFIG_PROC_FS */
+
+/* 'match' is our primary function, called by the kernel whenever a rule is
+ * hit with our module as an option to it.
+ * What this function does depends on what was specifically asked of it by
+ * the user:
+ * --set -- Add or update last seen time of the source address of the packet
+ *   -- matchinfo->check_set == IPT_RECENT_SET
+ * --rcheck -- Just check if the source address is in the list
+ *   -- matchinfo->check_set == IPT_RECENT_CHECK
+ * --update -- If the source address is in the list, update last_seen
+ *   -- matchinfo->check_set == IPT_RECENT_UPDATE
+ * --remove -- If the source address is in the list, remove it
+ *   -- matchinfo->check_set == IPT_RECENT_REMOVE
+ * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds
+ *   -- matchinfo->seconds
+ * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times
+ *   -- matchinfo->hit_count
+ * --seconds and --hitcount can be combined
+ */
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	int pkt_count, hits_found, ans;
+	unsigned long now;
+	const struct ipt_recent_info *info = matchinfo;
+	u_int32_t addr = 0, time_temp;
+	u_int8_t ttl = skb->nh.iph->ttl;
+	int *hash_table;
+	int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1;
+	struct time_info_list *time_info;
+	struct recent_ip_tables *curr_table;
+	struct recent_ip_tables *last_table;
+	struct recent_ip_list *r_list;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n");
+#endif
+
+	/* Default is false ^ info->invert */
+	ans = info->invert;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name);
+#endif
+
+	/* if out != NULL then routing has been done and TTL changed.
+	 * We change it back here internally for match what came in before routing. */
+	if(out) ttl++;
+
+	/* Find the right table */
+	spin_lock_bh(&recent_lock);
+	curr_table = r_tables;
+	while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) );
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name);
+#endif
+
+	spin_unlock_bh(&recent_lock);
+
+	/* Table with this name not found, match impossible */
+	if(!curr_table) { return ans; }
+
+	/* Make sure no one is changing the list while we work with it */
+	spin_lock_bh(&curr_table->list_lock);
+
+	r_list = curr_table->table;
+	if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr;
+
+	if(!addr) { 
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr);
+#endif
+		spin_unlock_bh(&curr_table->list_lock);
+		return ans;
+	}
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl);
+#endif
+
+	/* Get jiffies now in case they changed while we were waiting for a lock */
+	now = jiffies;
+	hash_table = curr_table->hash_table;
+	time_info = curr_table->time_info;
+
+	orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size);
+	/* Hash entry at this result used */
+	/* Check for TTL match if requested.  If TTL is zero then a match would never
+	 * happen, so match regardless of existing TTL in that case.  Zero means the
+	 * entry was added via the /proc interface anyway, so we will just use the
+	 * first TTL we get for that IP address. */
+	if(info->check_set & IPT_RECENT_TTL) {
+		while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr &&
+			(!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) {
+			/* Collision in hash table */
+			hash_result = (hash_result + 1) % ip_list_hash_size;
+		}
+	} else {
+		while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) {
+			/* Collision in hash table */
+			hash_result = (hash_result + 1) % ip_list_hash_size;
+		}
+	}
+
+	if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) {
+		/* IP not in list and not asked to SET */
+		spin_unlock_bh(&curr_table->list_lock);
+		return ans;
+	}
+
+	/* Check if we need to handle the collision, do not need to on REMOVE */
+	if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n",
+				 orig_hash_result,
+				 hash_result,
+				 r_list[hash_table[orig_hash_result]].addr,
+				 addr);
+#endif
+
+		/* We had a collision.
+		 * orig_hash_result is where we started, hash_result is where we ended up.
+		 * So, swap them because we are likely to see the same guy again sooner */
+#ifdef DEBUG
+		if(debug) {
+		  printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]);
+		  printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n",
+				r_list[hash_table[orig_hash_result]].hash_entry);
+		}
+#endif
+
+		r_list[hash_table[orig_hash_result]].hash_entry = hash_result;
+
+
+		temp = hash_table[orig_hash_result];
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]);
+#endif
+		hash_table[orig_hash_result] = hash_table[hash_result];
+		hash_table[hash_result] = temp;
+		temp = hash_result;
+		hash_result = orig_hash_result;
+		orig_hash_result = temp;
+		time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result];
+		if(hash_table[hash_result] != -1) {
+			r_list[hash_table[hash_result]].hash_entry = hash_result;
+			time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+		}
+
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n");
+#endif
+	}
+
+	if(hash_table[hash_result] == -1) {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n",
+				 hash_result, addr);
+#endif
+
+		/* New item found and IPT_RECENT_SET, so we need to add it */
+		location = time_info[curr_table->time_pos].position;
+		hash_table[r_list[location].hash_entry] = -1;
+		hash_table[hash_result] = location;
+		memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+		r_list[location].time_pos = curr_table->time_pos;
+		r_list[location].addr = addr;
+		r_list[location].ttl = ttl;
+		r_list[location].last_seen = now;
+		r_list[location].oldest_pkt = 1;
+		r_list[location].last_pkts[0] = now;
+		r_list[location].hash_entry = hash_result;
+		time_info[curr_table->time_pos].time = r_list[location].last_seen;
+		curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot;
+
+		ans = !info->invert;
+	} else {
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n",
+				 hash_result,
+				 addr);
+#endif
+
+		/* Existing item found */
+		location = hash_table[hash_result];
+		/* We have a match on address, now to make sure it meets all requirements for a
+		 * full match. */
+		if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) {
+			if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert;
+			if(info->seconds && !info->hit_count) {
+				if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert;
+			}
+			if(info->seconds && info->hit_count) {
+				for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
+					if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++;
+				}
+				if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
+			}
+			if(info->hit_count && !info->seconds) {
+				for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) {
+					if(r_list[location].last_pkts[pkt_count] == 0) break;
+					hits_found++;
+				}
+				if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert;
+			}
+		}
+#ifdef DEBUG
+		if(debug) {
+			if(ans)
+				printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr);
+			else
+				printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr);
+		}
+#endif
+
+		/* If and only if we have been asked to SET, or to UPDATE (on match) do we add the
+		 * current timestamp to the last_seen. */
+		if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) {
+#ifdef DEBUG
+			if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n");
+#endif
+			/* Have to update our time info */
+			time_loc = r_list[location].time_pos;
+			time_info[time_loc].time = now;
+			time_info[time_loc].position = location;
+			while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
+				time_temp = time_info[time_loc].time;
+				time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
+				time_info[(time_loc+1)%ip_list_tot].time = time_temp;
+				time_temp = time_info[time_loc].position;
+				time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
+				time_info[(time_loc+1)%ip_list_tot].position = time_temp;
+				r_list[time_info[time_loc].position].time_pos = time_loc;
+				r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
+				time_loc = (time_loc+1) % ip_list_tot;
+			}
+			r_list[location].time_pos = time_loc;
+			r_list[location].ttl = ttl;
+			r_list[location].last_pkts[r_list[location].oldest_pkt] = now;
+			r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot;
+			r_list[location].last_seen = now;
+		}
+		/* If we have been asked to remove the entry from the list, just set it to 0 */
+		if(info->check_set & IPT_RECENT_REMOVE) {
+#ifdef DEBUG
+			if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result);
+#endif
+			/* Check if this is part of a collision chain */
+			while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) {
+				orig_hash_result++;
+				if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) {
+					/* Found collision chain, how deep does this rabbit hole go? */
+#ifdef DEBUG
+					if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n");
+#endif
+					end_collision_chain = orig_hash_result;
+				}
+			}
+			if(end_collision_chain != -1) {
+#ifdef DEBUG
+				if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n");
+#endif
+				/* Part of a collision chain, swap it with the end of the chain
+				 * before removing. */
+				r_list[hash_table[end_collision_chain]].hash_entry = hash_result;
+				temp = hash_table[end_collision_chain];
+				hash_table[end_collision_chain] = hash_table[hash_result];
+				hash_table[hash_result] = temp;
+				time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+				hash_result = end_collision_chain;
+				r_list[hash_table[hash_result]].hash_entry = hash_result;
+				time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result];
+			}
+			location = hash_table[hash_result];
+			hash_table[r_list[location].hash_entry] = -1;
+			time_loc = r_list[location].time_pos;
+			time_info[time_loc].time = 0;
+			time_info[time_loc].position = location;
+			while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) {
+				time_temp = time_info[time_loc].time;
+				time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time;
+				time_info[(time_loc+1)%ip_list_tot].time = time_temp;
+				time_temp = time_info[time_loc].position;
+				time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position;
+				time_info[(time_loc+1)%ip_list_tot].position = time_temp;
+				r_list[time_info[time_loc].position].time_pos = time_loc;
+				r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot;
+				time_loc = (time_loc+1) % ip_list_tot;
+			}
+			r_list[location].time_pos = time_loc;
+			r_list[location].last_seen = 0;
+			r_list[location].addr = 0;
+			r_list[location].ttl = 0;
+			memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t));
+			r_list[location].oldest_pkt = 0;
+			ans = !info->invert;
+		}
+		spin_unlock_bh(&curr_table->list_lock);
+		return ans;
+	}
+
+	spin_unlock_bh(&curr_table->list_lock);
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n");
+#endif
+	return ans;
+}
+
+/* This function is to verify that the rule given during the userspace iptables
+ * command is correct.
+ * If the command is valid then we check if the table name referred to by the
+ * rule exists, if not it is created.
+ */
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	int flag = 0, c;
+	unsigned long *hold;
+	const struct ipt_recent_info *info = matchinfo;
+	struct recent_ip_tables *curr_table, *find_table, *last_table;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n");
+#endif
+
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return 0;
+
+	/* seconds and hit_count only valid for CHECK/UPDATE */
+	if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; }
+	if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; }
+	if(info->check_set & IPT_RECENT_CHECK) flag++;
+	if(info->check_set & IPT_RECENT_UPDATE) flag++;
+
+	/* One and only one of these should ever be set */
+	if(flag != 1) return 0;
+
+	/* Name must be set to something */
+	if(!info->name || !info->name[0]) return 0;
+
+	/* Things look good, create a list for this if it does not exist */
+	/* Lock the linked list while we play with it */
+	spin_lock_bh(&recent_lock);
+
+	/* Look for an entry with this name already created */
+	/* Finds the end of the list and the entry before the end if current name does not exist */
+	find_table = r_tables;
+	while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
+
+	/* If a table already exists just increment the count on that table and return */
+	if(find_table) { 
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name);
+#endif
+		find_table->count++;
+		spin_unlock_bh(&recent_lock);
+		return 1;
+	}
+
+	spin_unlock_bh(&recent_lock);
+
+	/* Table with this name not found */
+	/* Allocate memory for new linked list item */
+
+#ifdef DEBUG
+	if(debug) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name);
+		printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables));
+	}
+#endif
+
+	curr_table = vmalloc(sizeof(struct recent_ip_tables));
+	if(curr_table == NULL) return 0;
+
+	spin_lock_init(&curr_table->list_lock);
+	curr_table->next = NULL;
+	curr_table->count = 1;
+	curr_table->time_pos = 0;
+	strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN);
+	curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0';
+
+	/* Allocate memory for this table and the list of packets in each entry. */
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n",
+			sizeof(struct recent_ip_list)*ip_list_tot,
+			info->name);
+#endif
+
+	curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot);
+	if(curr_table->table == NULL) { vfree(curr_table); return 0; }
+	memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot);
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n",
+			sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
+#endif
+
+	hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot);
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n");
+#endif
+	if(hold == NULL) { 
+		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n");
+		vfree(curr_table->table); 
+		vfree(curr_table);
+		return 0;
+	}
+	for(c = 0; c < ip_list_tot; c++) {
+		curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot;
+	}
+
+	/* Allocate memory for the hash table */
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n",
+			sizeof(int)*ip_list_hash_size);
+#endif
+
+	curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size);
+	if(!curr_table->hash_table) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n");
+		vfree(hold);
+		vfree(curr_table->table); 
+		vfree(curr_table);
+		return 0;
+	}
+
+	for(c = 0; c < ip_list_hash_size; c++) {
+		curr_table->hash_table[c] = -1;
+	}
+
+	/* Allocate memory for the time info */
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n",
+			sizeof(struct time_info_list)*ip_list_tot);
+#endif
+
+	curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot);
+	if(!curr_table->time_info) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n");
+		vfree(curr_table->hash_table);
+		vfree(hold);
+		vfree(curr_table->table); 
+		vfree(curr_table);
+		return 0;
+	}
+	for(c = 0; c < ip_list_tot; c++) {
+		curr_table->time_info[c].position = c;
+		curr_table->time_info[c].time = 0;
+	}
+
+	/* Put the new table in place */
+	spin_lock_bh(&recent_lock);
+	find_table = r_tables;
+	while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) );
+
+	/* If a table already exists just increment the count on that table and return */
+	if(find_table) { 
+		find_table->count++;	
+		spin_unlock_bh(&recent_lock);
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name);
+#endif
+		vfree(curr_table->time_info);
+		vfree(curr_table->hash_table);
+		vfree(hold);
+		vfree(curr_table->table);
+		vfree(curr_table);
+		return 1;
+	}
+	if(!last_table) r_tables = curr_table; else last_table->next = curr_table;
+
+	spin_unlock_bh(&recent_lock);
+
+#ifdef CONFIG_PROC_FS
+	/* Create our proc 'status' entry. */
+	curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent);
+	if (!curr_table->status_proc) {
+		printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n");
+		/* Destroy the created table */
+		spin_lock_bh(&recent_lock);
+		last_table = NULL;
+		curr_table = r_tables;
+		if(!curr_table) {
+#ifdef DEBUG
+			if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n");
+#endif
+			spin_unlock_bh(&recent_lock);
+			return 0;
+		}
+		while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
+		if(!curr_table) {
+#ifdef DEBUG
+			if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n");
+#endif
+			spin_unlock_bh(&recent_lock);
+			return 0;
+		}
+		if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
+		spin_unlock_bh(&recent_lock);
+		vfree(curr_table->time_info);
+		vfree(curr_table->hash_table);
+		vfree(hold);
+		vfree(curr_table->table);
+		vfree(curr_table);
+		return 0;
+	}
+	
+	curr_table->status_proc->owner = THIS_MODULE;
+	curr_table->status_proc->data = curr_table;
+	wmb();
+	curr_table->status_proc->read_proc = ip_recent_get_info;
+	curr_table->status_proc->write_proc = ip_recent_ctrl;
+#endif /* CONFIG_PROC_FS */
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n");
+#endif
+
+	return 1;
+}
+
+/* This function is called in the event that a rule matching this module is
+ * removed.
+ * When this happens we need to check if there are no other rules matching
+ * the table given.  If that is the case then we remove the table and clean
+ * up its memory.
+ */
+static void
+destroy(void *matchinfo, unsigned int matchsize)
+{
+	const struct ipt_recent_info *info = matchinfo;
+	struct recent_ip_tables *curr_table, *last_table;
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n");
+#endif
+
+	if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return;
+
+	/* Lock the linked list while we play with it */
+	spin_lock_bh(&recent_lock);
+
+	/* Look for an entry with this name already created */
+	/* Finds the end of the list and the entry before the end if current name does not exist */
+	last_table = NULL;
+	curr_table = r_tables;
+	if(!curr_table) { 
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n");
+#endif
+		spin_unlock_bh(&recent_lock);
+		return;
+	}
+	while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) );
+
+	/* If a table does not exist then do nothing and return */
+	if(!curr_table) { 
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n");
+#endif
+		spin_unlock_bh(&recent_lock);
+		return;
+	}
+
+	curr_table->count--;
+
+	/* If count is still non-zero then there are still rules referenceing it so we do nothing */
+	if(curr_table->count) { 
+#ifdef DEBUG
+		if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n");
+#endif
+		spin_unlock_bh(&recent_lock);
+		return;
+	}
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n");
+#endif
+
+	/* Count must be zero so we remove this table from the list */
+	if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next;
+
+	spin_unlock_bh(&recent_lock);
+
+	/* lock to make sure any late-runners still using this after we removed it from
+	 * the list finish up then remove everything */
+	spin_lock_bh(&curr_table->list_lock);
+	spin_unlock_bh(&curr_table->list_lock);
+
+#ifdef CONFIG_PROC_FS
+	if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent);
+#endif /* CONFIG_PROC_FS */
+	vfree(curr_table->table[0].last_pkts);
+	vfree(curr_table->table);
+	vfree(curr_table->hash_table);
+	vfree(curr_table->time_info);
+	vfree(curr_table);
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n");
+#endif
+
+	return;
+}
+
+/* This is the structure we pass to ipt_register to register our
+ * module with iptables.
+ */
+static struct ipt_match recent_match = { 
+  .name = "recent", 
+  .match = &match, 
+  .checkentry = &checkentry, 
+  .destroy = &destroy, 
+  .me = THIS_MODULE
+};
+
+/* Kernel module initialization. */
+static int __init init(void)
+{
+	int err, count;
+
+	printk(version);
+#ifdef CONFIG_PROC_FS
+	proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net);
+	if(!proc_net_ipt_recent) return -ENOMEM;
+#endif
+
+	if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) {
+	  printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n");
+	  ip_list_hash_size = 0;
+	}
+
+	if(!ip_list_hash_size) {
+		ip_list_hash_size = ip_list_tot*3;
+		count = 2*2;
+		while(ip_list_hash_size > count) count = count*2;
+		ip_list_hash_size = count;
+	}
+
+#ifdef DEBUG
+	if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size);
+#endif
+
+	err = ipt_register_match(&recent_match);
+	if (err)
+		remove_proc_entry("ipt_recent", proc_net);
+	return err;
+}
+
+/* Kernel module destruction. */
+static void __exit fini(void)
+{
+	ipt_unregister_match(&recent_match);
+
+	remove_proc_entry("ipt_recent",proc_net);
+}
+
+/* Register our module with the kernel. */
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c
new file mode 100644
index 000000000000..fe2b327bcaa4
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_sctp.c
@@ -0,0 +1,203 @@
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/sctp.h>
+
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_sctp.h>
+
+#ifdef DEBUG_SCTP
+#define duprintf(format, args...) printk(format , ## args)
+#else
+#define duprintf(format, args...)
+#endif
+
+#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \
+					      || (!!((invflag) & (option)) ^ (cond)))
+
+static int
+match_flags(const struct ipt_sctp_flag_info *flag_info,
+	    const int flag_count,
+	    u_int8_t chunktype,
+	    u_int8_t chunkflags)
+{
+	int i;
+
+	for (i = 0; i < flag_count; i++) {
+		if (flag_info[i].chunktype == chunktype) {
+			return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag;
+		}
+	}
+
+	return 1;
+}
+
+static int
+match_packet(const struct sk_buff *skb,
+	     const u_int32_t *chunkmap,
+	     int chunk_match_type,
+	     const struct ipt_sctp_flag_info *flag_info,
+	     const int flag_count,
+	     int *hotdrop)
+{
+	int offset;
+	u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)];
+	sctp_chunkhdr_t _sch, *sch;
+
+#ifdef DEBUG_SCTP
+	int i = 0;
+#endif
+
+	if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) {
+		SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap);
+	}
+
+	offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t);
+	do {
+		sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch);
+		if (sch == NULL) {
+			duprintf("Dropping invalid SCTP packet.\n");
+			*hotdrop = 1;
+			return 0;
+        	}
+
+		duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n", 
+				++i, offset, sch->type, htons(sch->length), sch->flags);
+
+		offset += (htons(sch->length) + 3) & ~3;
+
+		duprintf("skb->len: %d\toffset: %d\n", skb->len, offset);
+
+		if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) {
+			switch (chunk_match_type) {
+			case SCTP_CHUNK_MATCH_ANY:
+				if (match_flags(flag_info, flag_count, 
+					sch->type, sch->flags)) {
+					return 1;
+				}
+				break;
+
+			case SCTP_CHUNK_MATCH_ALL:
+				if (match_flags(flag_info, flag_count, 
+					sch->type, sch->flags)) {
+					SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type);
+				}
+				break;
+
+			case SCTP_CHUNK_MATCH_ONLY:
+				if (!match_flags(flag_info, flag_count, 
+					sch->type, sch->flags)) {
+					return 0;
+				}
+				break;
+			}
+		} else {
+			switch (chunk_match_type) {
+			case SCTP_CHUNK_MATCH_ONLY:
+				return 0;
+			}
+		}
+	} while (offset < skb->len);
+
+	switch (chunk_match_type) {
+	case SCTP_CHUNK_MATCH_ALL:
+		return SCTP_CHUNKMAP_IS_CLEAR(chunkmap);
+	case SCTP_CHUNK_MATCH_ANY:
+		return 0;
+	case SCTP_CHUNK_MATCH_ONLY:
+		return 1;
+	}
+
+	/* This will never be reached, but required to stop compiler whine */
+	return 0;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_sctp_info *info;
+	sctp_sctphdr_t _sh, *sh;
+
+	info = (const struct ipt_sctp_info *)matchinfo;
+
+	if (offset) {
+		duprintf("Dropping non-first fragment.. FIXME\n");
+		return 0;
+	}
+	
+	sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh);
+	if (sh == NULL) {
+		duprintf("Dropping evil TCP offset=0 tinygram.\n");
+		*hotdrop = 1;
+		return 0;
+       	}
+	duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest));
+
+	return  SCCHECK(((ntohs(sh->source) >= info->spts[0]) 
+			&& (ntohs(sh->source) <= info->spts[1])), 
+		   	IPT_SCTP_SRC_PORTS, info->flags, info->invflags)
+		&& SCCHECK(((ntohs(sh->dest) >= info->dpts[0]) 
+			&& (ntohs(sh->dest) <= info->dpts[1])), 
+			IPT_SCTP_DEST_PORTS, info->flags, info->invflags)
+		&& SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type,
+ 					info->flag_info, info->flag_count, 
+					hotdrop),
+			   IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags);
+}
+
+static int
+checkentry(const char *tablename,
+	   const struct ipt_ip *ip,
+	   void *matchinfo,
+	   unsigned int matchsize,
+	   unsigned int hook_mask)
+{
+	const struct ipt_sctp_info *info;
+
+	info = (const struct ipt_sctp_info *)matchinfo;
+
+	return ip->proto == IPPROTO_SCTP
+		&& !(ip->invflags & IPT_INV_PROTO)
+		&& matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info))
+		&& !(info->flags & ~IPT_SCTP_VALID_FLAGS)
+		&& !(info->invflags & ~IPT_SCTP_VALID_FLAGS)
+		&& !(info->invflags & ~info->flags)
+		&& ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) || 
+			(info->chunk_match_type &
+				(SCTP_CHUNK_MATCH_ALL 
+				| SCTP_CHUNK_MATCH_ANY
+				| SCTP_CHUNK_MATCH_ONLY)));
+}
+
+static struct ipt_match sctp_match = 
+{ 
+	.list = { NULL, NULL},
+	.name = "sctp",
+	.match = &match,
+	.checkentry = &checkentry,
+	.destroy = NULL,
+	.me = THIS_MODULE
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&sctp_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&sctp_match);
+}
+
+module_init(init);
+module_exit(fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Kiran Kumar Immidi");
+MODULE_DESCRIPTION("Match for SCTP protocol packets");
+
diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c
new file mode 100644
index 000000000000..b1511b97ea5f
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_state.c
@@ -0,0 +1,74 @@
+/* Kernel module to match connection tracking information. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter_ipv4/ip_conntrack.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_state.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>");
+MODULE_DESCRIPTION("iptables connection tracking state match module");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_state_info *sinfo = matchinfo;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	if (skb->nfct == &ip_conntrack_untracked.ct_general)
+		statebit = IPT_STATE_UNTRACKED;
+	else if (!ip_conntrack_get(skb, &ctinfo))
+		statebit = IPT_STATE_INVALID;
+	else
+		statebit = IPT_STATE_BIT(ctinfo);
+
+	return (sinfo->statemask & statebit);
+}
+
+static int check(const char *tablename,
+		 const struct ipt_ip *ip,
+		 void *matchinfo,
+		 unsigned int matchsize,
+		 unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match state_match = {
+	.name		= "state",
+	.match		= &match,
+	.checkentry	= &check,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	need_ip_conntrack();
+	return ipt_register_match(&state_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&state_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/ipv4/netfilter/ipt_tcpmss.c
new file mode 100644
index 000000000000..4dc9b16ab4a3
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tcpmss.c
@@ -0,0 +1,127 @@
+/* Kernel module to match TCP MSS values. */
+
+/* Copyright (C) 2000 Marc Boucher <marc@mbsi.ca>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter_ipv4/ipt_tcpmss.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#define TH_SYN 0x02
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>");
+MODULE_DESCRIPTION("iptables TCP MSS match module");
+
+/* Returns 1 if the mss option is set and matched by the range, 0 otherwise */
+static inline int
+mssoption_match(u_int16_t min, u_int16_t max,
+		const struct sk_buff *skb,
+		int invert,
+		int *hotdrop)
+{
+	struct tcphdr _tcph, *th;
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	u8 _opt[15 * 4 - sizeof(_tcph)], *op;
+	unsigned int i, optlen;
+
+	/* If we don't have the whole header, drop packet. */
+	th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
+				sizeof(_tcph), &_tcph);
+	if (th == NULL)
+		goto dropit;
+
+	/* Malformed. */
+	if (th->doff*4 < sizeof(*th))
+		goto dropit;
+
+	optlen = th->doff*4 - sizeof(*th);
+	if (!optlen)
+		goto out;
+
+	/* Truncated options. */
+	op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th),
+				optlen, _opt);
+	if (op == NULL)
+		goto dropit;
+
+	for (i = 0; i < optlen; ) {
+		if (op[i] == TCPOPT_MSS
+		    && (optlen - i) >= TCPOLEN_MSS
+		    && op[i+1] == TCPOLEN_MSS) {
+			u_int16_t mssval;
+
+			mssval = (op[i+2] << 8) | op[i+3];
+			
+			return (mssval >= min && mssval <= max) ^ invert;
+		}
+		if (op[i] < 2) i++;
+		else i += op[i+1]?:1;
+	}
+out:
+	return invert;
+
+ dropit:
+	*hotdrop = 1;
+	return 0;
+}
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_tcpmss_match_info *info = matchinfo;
+
+	return mssoption_match(info->mss_min, info->mss_max, skb,
+			       info->invert, hotdrop);
+}
+
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info)))
+		return 0;
+
+	/* Must specify -p tcp */
+	if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) {
+		printk("tcpmss: Only works on TCP packets\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match tcpmss_match = {
+	.name		= "tcpmss",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&tcpmss_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&tcpmss_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c
new file mode 100644
index 000000000000..086a1bb61e3e
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_tos.c
@@ -0,0 +1,64 @@
+/* Kernel module to match TOS values. */
+
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_tos.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("iptables TOS match module");
+
+static int
+match(const struct sk_buff *skb,
+      const struct net_device *in,
+      const struct net_device *out,
+      const void *matchinfo,
+      int offset,
+      int *hotdrop)
+{
+	const struct ipt_tos_info *info = matchinfo;
+
+	return (skb->nh.iph->tos == info->tos) ^ info->invert;
+}
+
+static int
+checkentry(const char *tablename,
+           const struct ipt_ip *ip,
+           void *matchinfo,
+           unsigned int matchsize,
+           unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match tos_match = {
+	.name		= "tos",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&tos_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&tos_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c
new file mode 100644
index 000000000000..219aa9de88cc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_ttl.c
@@ -0,0 +1,79 @@
+/* IP tables module for matching the value of the TTL 
+ *
+ * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp
+ *
+ * (C) 2000,2001 by Harald Welte <laforge@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+
+#include <linux/netfilter_ipv4/ipt_ttl.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
+MODULE_DESCRIPTION("IP tables TTL matching module");
+MODULE_LICENSE("GPL");
+
+static int match(const struct sk_buff *skb, const struct net_device *in,
+		 const struct net_device *out, const void *matchinfo,
+		 int offset, int *hotdrop)
+{
+	const struct ipt_ttl_info *info = matchinfo;
+
+	switch (info->mode) {
+		case IPT_TTL_EQ:
+			return (skb->nh.iph->ttl == info->ttl);
+			break;
+		case IPT_TTL_NE:
+			return (!(skb->nh.iph->ttl == info->ttl));
+			break;
+		case IPT_TTL_LT:
+			return (skb->nh.iph->ttl < info->ttl);
+			break;
+		case IPT_TTL_GT:
+			return (skb->nh.iph->ttl > info->ttl);
+			break;
+		default:
+			printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", 
+				info->mode);
+			return 0;
+	}
+
+	return 0;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+		      void *matchinfo, unsigned int matchsize,
+		      unsigned int hook_mask)
+{
+	if (matchsize != IPT_ALIGN(sizeof(struct ipt_ttl_info)))
+		return 0;
+
+	return 1;
+}
+
+static struct ipt_match ttl_match = {
+	.name		= "ttl",
+	.match		= &match,
+	.checkentry	= &checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&ttl_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&ttl_match);
+
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
new file mode 100644
index 000000000000..260a4f0a2a90
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -0,0 +1,194 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables filter table");
+
+#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT))
+
+static struct
+{
+	struct ipt_replace repl;
+	struct ipt_standard entries[3];
+	struct ipt_error term;
+} initial_table __initdata 
+= { { "filter", FILTER_VALID_HOOKS, 4,
+      sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
+      { [NF_IP_LOCAL_IN] = 0,
+	[NF_IP_FORWARD] = sizeof(struct ipt_standard),
+	[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
+      { [NF_IP_LOCAL_IN] = 0,
+	[NF_IP_FORWARD] = sizeof(struct ipt_standard),
+	[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 },
+      0, NULL, { } },
+    {
+	    /* LOCAL_IN */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* FORWARD */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* LOCAL_OUT */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } }
+    },
+    /* ERROR */
+    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+	0,
+	sizeof(struct ipt_entry),
+	sizeof(struct ipt_error),
+	0, { 0, 0 }, { } },
+      { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
+	  { } },
+	"ERROR"
+      }
+    }
+};
+
+static struct ipt_table packet_filter = {
+	.name		= "filter",
+	.valid_hooks	= FILTER_VALID_HOOKS,
+	.lock		= RW_LOCK_UNLOCKED,
+	.me		= THIS_MODULE
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ipt_hook(unsigned int hook,
+	 struct sk_buff **pskb,
+	 const struct net_device *in,
+	 const struct net_device *out,
+	 int (*okfn)(struct sk_buff *))
+{
+	return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+static unsigned int
+ipt_local_out_hook(unsigned int hook,
+		   struct sk_buff **pskb,
+		   const struct net_device *in,
+		   const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	/* root is playing with raw sockets. */
+	if ((*pskb)->len < sizeof(struct iphdr)
+	    || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+		if (net_ratelimit())
+			printk("ipt_hook: happy cracking.\n");
+		return NF_ACCEPT;
+	}
+
+	return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL);
+}
+
+static struct nf_hook_ops ipt_ops[] = {
+	{
+		.hook		= ipt_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_IP_LOCAL_IN,
+		.priority	= NF_IP_PRI_FILTER,
+	},
+	{
+		.hook		= ipt_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_IP_FORWARD,
+		.priority	= NF_IP_PRI_FILTER,
+	},
+	{
+		.hook		= ipt_local_out_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_IP_LOCAL_OUT,
+		.priority	= NF_IP_PRI_FILTER,
+	},
+};
+
+/* Default to forward because I got too much mail already. */
+static int forward = NF_ACCEPT;
+module_param(forward, bool, 0000);
+
+static int __init init(void)
+{
+	int ret;
+
+	if (forward < 0 || forward > NF_MAX_VERDICT) {
+		printk("iptables forward must be 0 or 1\n");
+		return -EINVAL;
+	}
+
+	/* Entry 1 is the FORWARD hook */
+	initial_table.entries[1].target.verdict = -forward - 1;
+
+	/* Register table */
+	ret = ipt_register_table(&packet_filter, &initial_table.repl);
+	if (ret < 0)
+		return ret;
+
+	/* Register hooks */
+	ret = nf_register_hook(&ipt_ops[0]);
+	if (ret < 0)
+		goto cleanup_table;
+
+	ret = nf_register_hook(&ipt_ops[1]);
+	if (ret < 0)
+		goto cleanup_hook0;
+
+	ret = nf_register_hook(&ipt_ops[2]);
+	if (ret < 0)
+		goto cleanup_hook1;
+
+	return ret;
+
+ cleanup_hook1:
+	nf_unregister_hook(&ipt_ops[1]);
+ cleanup_hook0:
+	nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+	ipt_unregister_table(&packet_filter);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+		nf_unregister_hook(&ipt_ops[i]);
+
+	ipt_unregister_table(&packet_filter);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
new file mode 100644
index 000000000000..160eb11b6e2f
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -0,0 +1,260 @@
+/*
+ * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x.
+ *
+ * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling
+ * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Extended to all five netfilter hooks by Brad Chapman & Harald Welte
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/route.h>
+#include <linux/ip.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
+MODULE_DESCRIPTION("iptables mangle table");
+
+#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \
+			    (1 << NF_IP_LOCAL_IN) | \
+			    (1 << NF_IP_FORWARD) | \
+			    (1 << NF_IP_LOCAL_OUT) | \
+			    (1 << NF_IP_POST_ROUTING))
+
+/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
+static struct
+{
+	struct ipt_replace repl;
+	struct ipt_standard entries[5];
+	struct ipt_error term;
+} initial_table __initdata
+= { { "mangle", MANGLE_VALID_HOOKS, 6,
+      sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
+      { [NF_IP_PRE_ROUTING] 	= 0,
+	[NF_IP_LOCAL_IN] 	= sizeof(struct ipt_standard),
+	[NF_IP_FORWARD] 	= sizeof(struct ipt_standard) * 2,
+	[NF_IP_LOCAL_OUT] 	= sizeof(struct ipt_standard) * 3,
+	[NF_IP_POST_ROUTING] 	= sizeof(struct ipt_standard) * 4 },
+      { [NF_IP_PRE_ROUTING] 	= 0,
+	[NF_IP_LOCAL_IN] 	= sizeof(struct ipt_standard),
+	[NF_IP_FORWARD] 	= sizeof(struct ipt_standard) * 2,
+	[NF_IP_LOCAL_OUT] 	= sizeof(struct ipt_standard) * 3,
+	[NF_IP_POST_ROUTING]	= sizeof(struct ipt_standard) * 4 },
+      0, NULL, { } },
+    {
+	    /* PRE_ROUTING */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* LOCAL_IN */
+ 	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* FORWARD */
+ 	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* LOCAL_OUT */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+	    /* POST_ROUTING */
+	    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+		0,
+		sizeof(struct ipt_entry),
+		sizeof(struct ipt_standard),
+		0, { 0, 0 }, { } },
+	      { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } },
+		-NF_ACCEPT - 1 } },
+    },
+    /* ERROR */
+    { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 },
+	0,
+	sizeof(struct ipt_entry),
+	sizeof(struct ipt_error),
+	0, { 0, 0 }, { } },
+      { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } },
+	  { } },
+	"ERROR"
+      }
+    }
+};
+
+static struct ipt_table packet_mangler = {
+	.name		= "mangle",
+	.valid_hooks	= MANGLE_VALID_HOOKS,
+	.lock		= RW_LOCK_UNLOCKED,
+	.me		= THIS_MODULE,
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ipt_route_hook(unsigned int hook,
+	 struct sk_buff **pskb,
+	 const struct net_device *in,
+	 const struct net_device *out,
+	 int (*okfn)(struct sk_buff *))
+{
+	return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+}
+
+static unsigned int
+ipt_local_hook(unsigned int hook,
+		   struct sk_buff **pskb,
+		   const struct net_device *in,
+		   const struct net_device *out,
+		   int (*okfn)(struct sk_buff *))
+{
+	unsigned int ret;
+	u_int8_t tos;
+	u_int32_t saddr, daddr;
+	unsigned long nfmark;
+
+	/* root is playing with raw sockets. */
+	if ((*pskb)->len < sizeof(struct iphdr)
+	    || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) {
+		if (net_ratelimit())
+			printk("ipt_hook: happy cracking.\n");
+		return NF_ACCEPT;
+	}
+
+	/* Save things which could affect route */
+	nfmark = (*pskb)->nfmark;
+	saddr = (*pskb)->nh.iph->saddr;
+	daddr = (*pskb)->nh.iph->daddr;
+	tos = (*pskb)->nh.iph->tos;
+
+	ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL);
+	/* Reroute for ANY change. */
+	if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE
+	    && ((*pskb)->nh.iph->saddr != saddr
+		|| (*pskb)->nh.iph->daddr != daddr
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		|| (*pskb)->nfmark != nfmark
+#endif
+		|| (*pskb)->nh.iph->tos != tos))
+		return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
+
+	return ret;
+}
+
+static struct nf_hook_ops ipt_ops[] = {
+	{
+		.hook		= ipt_route_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_IP_PRE_ROUTING, 
+		.priority	= NF_IP_PRI_MANGLE,
+	},
+	{
+		.hook		= ipt_route_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_IP_LOCAL_IN,
+		.priority	= NF_IP_PRI_MANGLE,
+	},
+	{
+		.hook		= ipt_route_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_IP_FORWARD,
+		.priority	= NF_IP_PRI_MANGLE,
+	},
+	{
+		.hook		= ipt_local_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_IP_LOCAL_OUT,
+		.priority	= NF_IP_PRI_MANGLE,
+	},
+	{
+		.hook		= ipt_route_hook,
+		.owner		= THIS_MODULE,
+		.pf		= PF_INET,
+		.hooknum	= NF_IP_POST_ROUTING,
+		.priority	= NF_IP_PRI_MANGLE,
+	},
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	/* Register table */
+	ret = ipt_register_table(&packet_mangler, &initial_table.repl);
+	if (ret < 0)
+		return ret;
+
+	/* Register hooks */
+	ret = nf_register_hook(&ipt_ops[0]);
+	if (ret < 0)
+		goto cleanup_table;
+
+	ret = nf_register_hook(&ipt_ops[1]);
+	if (ret < 0)
+		goto cleanup_hook0;
+
+	ret = nf_register_hook(&ipt_ops[2]);
+	if (ret < 0)
+		goto cleanup_hook1;
+
+	ret = nf_register_hook(&ipt_ops[3]);
+	if (ret < 0)
+		goto cleanup_hook2;
+
+	ret = nf_register_hook(&ipt_ops[4]);
+	if (ret < 0)
+		goto cleanup_hook3;
+
+	return ret;
+
+ cleanup_hook3:
+        nf_unregister_hook(&ipt_ops[3]);
+ cleanup_hook2:
+        nf_unregister_hook(&ipt_ops[2]);
+ cleanup_hook1:
+	nf_unregister_hook(&ipt_ops[1]);
+ cleanup_hook0:
+	nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+	ipt_unregister_table(&packet_mangler);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+		nf_unregister_hook(&ipt_ops[i]);
+
+	ipt_unregister_table(&packet_mangler);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
new file mode 100644
index 000000000000..01b4a3c814d3
--- /dev/null
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -0,0 +1,156 @@
+/* 
+ * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT .
+ *
+ * Copyright (C) 2003 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
+ */
+#include <linux/module.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+
+#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))
+
+static struct
+{
+	struct ipt_replace repl;
+	struct ipt_standard entries[2];
+	struct ipt_error term;
+} initial_table __initdata = {
+	.repl = {
+		.name = "raw", 
+		.valid_hooks = RAW_VALID_HOOKS, 
+		.num_entries = 3,
+		.size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
+		.hook_entry = { 
+			[NF_IP_PRE_ROUTING] = 0,
+			[NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) },
+		.underflow = { 
+			[NF_IP_PRE_ROUTING] = 0,
+			[NF_IP_LOCAL_OUT]  = sizeof(struct ipt_standard) },
+	},
+	.entries = {
+	     /* PRE_ROUTING */
+	     { 
+		     .entry = { 
+			     .target_offset = sizeof(struct ipt_entry),
+			     .next_offset = sizeof(struct ipt_standard),
+		     },
+		     .target = { 
+			  .target = { 
+				  .u = {
+					  .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
+				  },
+			  },
+			  .verdict = -NF_ACCEPT - 1,
+		     },
+	     },
+
+	     /* LOCAL_OUT */
+	     {
+		     .entry = {
+			     .target_offset = sizeof(struct ipt_entry),
+			     .next_offset = sizeof(struct ipt_standard),
+		     },
+		     .target = {
+			     .target = {
+				     .u = {
+					     .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)),
+				     },
+			     },
+			     .verdict = -NF_ACCEPT - 1,
+		     },
+	     },
+	},
+	/* ERROR */
+	.term = {
+		.entry = {
+			.target_offset = sizeof(struct ipt_entry),
+			.next_offset = sizeof(struct ipt_error),
+		},
+		.target = {
+			.target = {
+				.u = {
+					.user = {
+						.target_size = IPT_ALIGN(sizeof(struct ipt_error_target)), 
+						.name = IPT_ERROR_TARGET,
+					},
+				},
+			},
+			.errorname = "ERROR",
+		},
+	}
+};
+
+static struct ipt_table packet_raw = { 
+	.name = "raw", 
+	.valid_hooks =  RAW_VALID_HOOKS, 
+	.lock = RW_LOCK_UNLOCKED, 
+	.me = THIS_MODULE
+};
+
+/* The work comes in here from netfilter.c. */
+static unsigned int
+ipt_hook(unsigned int hook,
+	 struct sk_buff **pskb,
+	 const struct net_device *in,
+	 const struct net_device *out,
+	 int (*okfn)(struct sk_buff *))
+{
+	return ipt_do_table(pskb, hook, in, out, &packet_raw, NULL);
+}
+
+/* 'raw' is the very first table. */
+static struct nf_hook_ops ipt_ops[] = {
+	{
+	  .hook = ipt_hook, 
+	  .pf = PF_INET, 
+	  .hooknum = NF_IP_PRE_ROUTING, 
+	  .priority = NF_IP_PRI_RAW
+	},
+	{
+	  .hook = ipt_hook, 
+	  .pf = PF_INET, 
+	  .hooknum = NF_IP_LOCAL_OUT, 
+	  .priority = NF_IP_PRI_RAW
+	},
+};
+
+static int __init init(void)
+{
+	int ret;
+
+	/* Register table */
+	ret = ipt_register_table(&packet_raw, &initial_table.repl);
+	if (ret < 0)
+		return ret;
+
+	/* Register hooks */
+	ret = nf_register_hook(&ipt_ops[0]);
+	if (ret < 0)
+		goto cleanup_table;
+
+	ret = nf_register_hook(&ipt_ops[1]);
+	if (ret < 0)
+		goto cleanup_hook0;
+
+	return ret;
+
+ cleanup_hook0:
+	nf_unregister_hook(&ipt_ops[0]);
+ cleanup_table:
+	ipt_unregister_table(&packet_raw);
+
+	return ret;
+}
+
+static void __exit fini(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++)
+		nf_unregister_hook(&ipt_ops[i]);
+
+	ipt_unregister_table(&packet_raw);
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
new file mode 100644
index 000000000000..912bbcc7f415
--- /dev/null
+++ b/net/ipv4/proc.c
@@ -0,0 +1,382 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		This file implements the various access functions for the
+ *		PROC file system.  It is mainly used for debugging and
+ *		statistics.
+ *
+ * Version:	$Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $
+ *
+ * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Gerald J. Heim, <heim@peanuts.informatik.uni-tuebingen.de>
+ *		Fred Baumgarten, <dc6iq@insu1.etec.uni-karlsruhe.de>
+ *		Erik Schoenfelder, <schoenfr@ibr.cs.tu-bs.de>
+ *
+ * Fixes:
+ *		Alan Cox	:	UDP sockets show the rxqueue/txqueue
+ *					using hint flag for the netinfo.
+ *	Pauline Middelink	:	identd support
+ *		Alan Cox	:	Make /proc safer.
+ *	Erik Schoenfelder	:	/proc/net/snmp
+ *		Alan Cox	:	Handle dead sockets properly.
+ *	Gerhard Koerting	:	Show both timers
+ *		Alan Cox	:	Allow inode to be NULL (kernel socket)
+ *	Andi Kleen		:	Add support for open_requests and
+ *					split functions for more readibility.
+ *	Andi Kleen		:	Add support for /proc/net/netstat
+ *	Arnaldo C. Melo		:	Convert to seq_file
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <net/icmp.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/sock.h>
+#include <net/raw.h>
+
+static int fold_prot_inuse(struct proto *proto)
+{
+	int res = 0;
+	int cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		res += proto->stats[cpu].inuse;
+
+	return res;
+}
+
+/*
+ *	Report socket allocation statistics [mea@utu.fi]
+ */
+static int sockstat_seq_show(struct seq_file *seq, void *v)
+{
+	/* From net/socket.c */
+	extern void socket_seq_show(struct seq_file *seq);
+
+	socket_seq_show(seq);
+	seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n",
+		   fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count),
+		   tcp_tw_count, atomic_read(&tcp_sockets_allocated),
+		   atomic_read(&tcp_memory_allocated));
+	seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot));
+	seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot));
+	seq_printf(seq,  "FRAG: inuse %d memory %d\n", ip_frag_nqueues,
+		   atomic_read(&ip_frag_mem));
+	return 0;
+}
+
+static int sockstat_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sockstat_seq_show, NULL);
+}
+
+static struct file_operations sockstat_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = sockstat_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,
+};
+
+static unsigned long
+fold_field(void *mib[], int offt)
+{
+	unsigned long res = 0;
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt);
+		res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt);
+	}
+	return res;
+}
+
+/* snmp items */
+static struct snmp_mib snmp4_ipstats_list[] = {
+	SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES),
+	SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS),
+	SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS),
+	SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS),
+	SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS),
+	SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS),
+	SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS),
+	SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS),
+	SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS),
+	SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES),
+	SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT),
+	SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS),
+	SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS),
+	SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS),
+	SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS),
+	SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
+	SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
+	SNMP_MIB_SENTINEL
+};
+
+static struct snmp_mib snmp4_icmp_list[] = {
+	SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS),
+	SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS),
+	SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS),
+	SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS),
+	SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS),
+	SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS),
+	SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS),
+	SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS),
+	SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS),
+	SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS),
+	SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS),
+	SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS),
+	SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS),
+	SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS),
+	SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS),
+	SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS),
+	SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS),
+	SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS),
+	SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS),
+	SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS),
+	SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS),
+	SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS),
+	SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS),
+	SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS),
+	SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS),
+	SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS),
+	SNMP_MIB_SENTINEL
+};
+
+static struct snmp_mib snmp4_tcp_list[] = {
+	SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM),
+	SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN),
+	SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX),
+	SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN),
+	SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS),
+	SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS),
+	SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS),
+	SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS),
+	SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB),
+	SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS),
+	SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS),
+	SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS),
+	SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
+	SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
+	SNMP_MIB_SENTINEL
+};
+
+static struct snmp_mib snmp4_udp_list[] = {
+	SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS),
+	SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS),
+	SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS),
+	SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS),
+	SNMP_MIB_SENTINEL
+};
+
+static struct snmp_mib snmp4_net_list[] = {
+	SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT),
+	SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV),
+	SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED),
+	SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS),
+	SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED),
+	SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED),
+	SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED),
+	SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS),
+	SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS),
+	SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER),
+	SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED),
+	SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED),
+	SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
+	SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED),
+	SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
+	SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+	SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
+	SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
+	SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
+	SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS),
+	SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS),
+	SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED),
+	SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG),
+	SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE),
+	SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED),
+	SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS),
+	SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER),
+	SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS),
+	SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS),
+	SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY),
+	SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY),
+	SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING),
+	SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER),
+	SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER),
+	SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER),
+	SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER),
+	SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO),
+	SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO),
+	SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO),
+	SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO),
+	SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS),
+	SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT),
+	SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES),
+	SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES),
+	SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES),
+	SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS),
+	SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS),
+	SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS),
+	SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS),
+	SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
+	SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
+	SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED),
+	SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
+	SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
+	SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
+	SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
+	SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV),
+	SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN),
+	SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA),
+	SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE),
+	SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY),
+	SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT),
+	SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER),
+	SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED),
+	SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES),
+	SNMP_MIB_SENTINEL
+};
+
+/*
+ *	Called from the PROCfs module. This outputs /proc/net/snmp.
+ */
+static int snmp_seq_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	seq_puts(seq, "Ip: Forwarding DefaultTTL");
+
+	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
+
+	seq_printf(seq, "\nIp: %d %d",
+			ipv4_devconf.forwarding ? 1 : 2, sysctl_ip_default_ttl);
+
+	for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
+		seq_printf(seq, " %lu",
+			   fold_field((void **) ip_statistics, 
+				      snmp4_ipstats_list[i].entry));
+
+	seq_puts(seq, "\nIcmp:");
+	for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_icmp_list[i].name);
+
+	seq_puts(seq, "\nIcmp:");
+	for (i = 0; snmp4_icmp_list[i].name != NULL; i++)
+		seq_printf(seq, " %lu",
+			   fold_field((void **) icmp_statistics, 
+				      snmp4_icmp_list[i].entry));
+
+	seq_puts(seq, "\nTcp:");
+	for (i = 0; snmp4_tcp_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_tcp_list[i].name);
+
+	seq_puts(seq, "\nTcp:");
+	for (i = 0; snmp4_tcp_list[i].name != NULL; i++) {
+		/* MaxConn field is signed, RFC 2012 */
+		if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
+			seq_printf(seq, " %ld",
+				   fold_field((void **) tcp_statistics, 
+					      snmp4_tcp_list[i].entry));
+		else
+			seq_printf(seq, " %lu",
+				   fold_field((void **) tcp_statistics,
+					      snmp4_tcp_list[i].entry));
+	}
+
+	seq_puts(seq, "\nUdp:");
+	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_udp_list[i].name);
+
+	seq_puts(seq, "\nUdp:");
+	for (i = 0; snmp4_udp_list[i].name != NULL; i++)
+		seq_printf(seq, " %lu",
+			   fold_field((void **) udp_statistics, 
+				      snmp4_udp_list[i].entry));
+
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+static int snmp_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, snmp_seq_show, NULL);
+}
+
+static struct file_operations snmp_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = snmp_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,
+};
+
+/*
+ *	Output /proc/net/netstat
+ */
+static int netstat_seq_show(struct seq_file *seq, void *v)
+{
+	int i;
+
+	seq_puts(seq, "TcpExt:");
+	for (i = 0; snmp4_net_list[i].name != NULL; i++)
+		seq_printf(seq, " %s", snmp4_net_list[i].name);
+
+	seq_puts(seq, "\nTcpExt:");
+	for (i = 0; snmp4_net_list[i].name != NULL; i++)
+		seq_printf(seq, " %lu",
+			   fold_field((void **) net_statistics, 
+				      snmp4_net_list[i].entry));
+
+	seq_putc(seq, '\n');
+	return 0;
+}
+
+static int netstat_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, netstat_seq_show, NULL);
+}
+
+static struct file_operations netstat_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = netstat_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = single_release,
+};
+
+int __init ip_misc_proc_init(void)
+{
+	int rc = 0;
+
+	if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops))
+		goto out_netstat;
+
+	if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops))
+		goto out_snmp;
+
+	if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops))
+		goto out_sockstat;
+out:
+	return rc;
+out_sockstat:
+	proc_net_remove("snmp");
+out_snmp:
+	proc_net_remove("netstat");
+out_netstat:
+	rc = -ENOMEM;
+	goto out;
+}
+
diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
new file mode 100644
index 000000000000..90a587cacaa4
--- /dev/null
+++ b/net/ipv4/protocol.c
@@ -0,0 +1,101 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		INET protocol dispatch tables.
+ *
+ * Version:	$Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ *		Alan Cox	: Ahah! udp icmp errors don't work because
+ *				  udp_err is never called!
+ *		Alan Cox	: Added new fields for init and ready for
+ *				  proper fragmentation (_NO_ 4K limits!)
+ *		Richard Colella	: Hang on hash collision
+ *		Vince Laviano	: Modified inet_del_protocol() to correctly
+ *				  maintain copy bit.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/config.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/timer.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/tcp.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/ipip.h>
+#include <linux/igmp.h>
+
+struct net_protocol *inet_protos[MAX_INET_PROTOS];
+static DEFINE_SPINLOCK(inet_proto_lock);
+
+/*
+ *	Add a protocol handler to the hash tables
+ */
+
+int inet_add_protocol(struct net_protocol *prot, unsigned char protocol)
+{
+	int hash, ret;
+
+	hash = protocol & (MAX_INET_PROTOS - 1);
+
+	spin_lock_bh(&inet_proto_lock);
+	if (inet_protos[hash]) {
+		ret = -1;
+	} else {
+		inet_protos[hash] = prot;
+		ret = 0;
+	}
+	spin_unlock_bh(&inet_proto_lock);
+
+	return ret;
+}
+
+/*
+ *	Remove a protocol from the hash tables.
+ */
+ 
+int inet_del_protocol(struct net_protocol *prot, unsigned char protocol)
+{
+	int hash, ret;
+
+	hash = protocol & (MAX_INET_PROTOS - 1);
+
+	spin_lock_bh(&inet_proto_lock);
+	if (inet_protos[hash] == prot) {
+		inet_protos[hash] = NULL;
+		ret = 0;
+	} else {
+		ret = -1;
+	}
+	spin_unlock_bh(&inet_proto_lock);
+
+	synchronize_net();
+
+	return ret;
+}
+
+EXPORT_SYMBOL(inet_add_protocol);
+EXPORT_SYMBOL(inet_del_protocol);
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
new file mode 100644
index 000000000000..93624a32eb9a
--- /dev/null
+++ b/net/ipv4/raw.c
@@ -0,0 +1,888 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		RAW - implementation of IP "raw" sockets.
+ *
+ * Version:	$Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ *		Alan Cox	:	verify_area() fixed up
+ *		Alan Cox	:	ICMP error handling
+ *		Alan Cox	:	EMSGSIZE if you send too big a packet
+ *		Alan Cox	: 	Now uses generic datagrams and shared
+ *					skbuff library. No more peek crashes,
+ *					no more backlogs
+ *		Alan Cox	:	Checks sk->broadcast.
+ *		Alan Cox	:	Uses skb_free_datagram/skb_copy_datagram
+ *		Alan Cox	:	Raw passes ip options too
+ *		Alan Cox	:	Setsocketopt added
+ *		Alan Cox	:	Fixed error return for broadcasts
+ *		Alan Cox	:	Removed wake_up calls
+ *		Alan Cox	:	Use ttl/tos
+ *		Alan Cox	:	Cleaned up old debugging
+ *		Alan Cox	:	Use new kernel side addresses
+ *	Arnt Gulbrandsen	:	Fixed MSG_DONTROUTE in raw sockets.
+ *		Alan Cox	:	BSD style RAW socket demultiplexing.
+ *		Alan Cox	:	Beginnings of mrouted support.
+ *		Alan Cox	:	Added IP_HDRINCL option.
+ *		Alan Cox	:	Skip broadcast check if BSDism set.
+ *		David S. Miller	:	New socket lookup architecture.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+ 
+#include <linux/config.h> 
+#include <asm/atomic.h>
+#include <asm/byteorder.h>
+#include <asm/current.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/aio.h>
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/sockios.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/mroute.h>
+#include <linux/netdevice.h>
+#include <linux/in_route.h>
+#include <linux/route.h>
+#include <linux/tcp.h>
+#include <linux/skbuff.h>
+#include <net/dst.h>
+#include <net/sock.h>
+#include <linux/gfp.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/udp.h>
+#include <net/raw.h>
+#include <net/snmp.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+#include <linux/rtnetlink.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE];
+DEFINE_RWLOCK(raw_v4_lock);
+
+static void raw_v4_hash(struct sock *sk)
+{
+	struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num &
+						 (RAWV4_HTABLE_SIZE - 1)];
+
+	write_lock_bh(&raw_v4_lock);
+	sk_add_node(sk, head);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock_bh(&raw_v4_lock);
+}
+
+static void raw_v4_unhash(struct sock *sk)
+{
+ 	write_lock_bh(&raw_v4_lock);
+	if (sk_del_node_init(sk))
+		sock_prot_dec_use(sk->sk_prot);
+	write_unlock_bh(&raw_v4_lock);
+}
+
+struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num,
+			     unsigned long raddr, unsigned long laddr,
+			     int dif)
+{
+	struct hlist_node *node;
+
+	sk_for_each_from(sk, node) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		if (inet->num == num 					&&
+		    !(inet->daddr && inet->daddr != raddr) 		&&
+		    !(inet->rcv_saddr && inet->rcv_saddr != laddr)	&&
+		    !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+			goto found; /* gotcha */
+	}
+	sk = NULL;
+found:
+	return sk;
+}
+
+/*
+ *	0 - deliver
+ *	1 - block
+ */
+static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb)
+{
+	int type;
+
+	if (!pskb_may_pull(skb, sizeof(struct icmphdr)))
+		return 1;
+
+	type = skb->h.icmph->type;
+	if (type < 32) {
+		__u32 data = raw_sk(sk)->filter.data;
+
+		return ((1 << type) & data) != 0;
+	}
+
+	/* Do not block unknown ICMP types */
+	return 0;
+}
+
+/* IP input processing comes here for RAW socket delivery.
+ * Caller owns SKB, so we must make clones.
+ *
+ * RFC 1122: SHOULD pass TOS value up to the transport layer.
+ * -> It does. And not only TOS, but all IP header.
+ */
+void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash)
+{
+	struct sock *sk;
+	struct hlist_head *head;
+
+	read_lock(&raw_v4_lock);
+	head = &raw_v4_htable[hash];
+	if (hlist_empty(head))
+		goto out;
+	sk = __raw_v4_lookup(__sk_head(head), iph->protocol,
+			     iph->saddr, iph->daddr,
+			     skb->dev->ifindex);
+
+	while (sk) {
+		if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) {
+			struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC);
+
+			/* Not releasing hash table! */
+			if (clone)
+				raw_rcv(sk, clone);
+		}
+		sk = __raw_v4_lookup(sk_next(sk), iph->protocol,
+				     iph->saddr, iph->daddr,
+				     skb->dev->ifindex);
+	}
+out:
+	read_unlock(&raw_v4_lock);
+}
+
+void raw_err (struct sock *sk, struct sk_buff *skb, u32 info)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int type = skb->h.icmph->type;
+	int code = skb->h.icmph->code;
+	int err = 0;
+	int harderr = 0;
+
+	/* Report error on raw socket, if:
+	   1. User requested ip_recverr.
+	   2. Socket is connected (otherwise the error indication
+	      is useless without ip_recverr and error is hard.
+	 */
+	if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED)
+		return;
+
+	switch (type) {
+	default:
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		return;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		err = EHOSTUNREACH;
+		if (code > NR_ICMP_UNREACH)
+			break;
+		err = icmp_err_convert[code].errno;
+		harderr = icmp_err_convert[code].fatal;
+		if (code == ICMP_FRAG_NEEDED) {
+			harderr = inet->pmtudisc != IP_PMTUDISC_DONT;
+			err = EMSGSIZE;
+		}
+	}
+
+	if (inet->recverr) {
+		struct iphdr *iph = (struct iphdr*)skb->data;
+		u8 *payload = skb->data + (iph->ihl << 2);
+
+		if (inet->hdrincl)
+			payload = skb->data;
+		ip_icmp_error(sk, skb, err, 0, info, payload);
+	}
+
+	if (inet->recverr || harderr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	}
+}
+
+static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
+{
+	/* Charge it to the socket. */
+	
+	if (sock_queue_rcv_skb(sk, skb) < 0) {
+		/* FIXME: increment a raw drops counter here */
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	return NET_RX_SUCCESS;
+}
+
+int raw_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		kfree_skb(skb);
+		return NET_RX_DROP;
+	}
+
+	skb_push(skb, skb->data - skb->nh.raw);
+
+	raw_rcv_skb(sk, skb);
+	return 0;
+}
+
+static int raw_send_hdrinc(struct sock *sk, void *from, int length,
+			struct rtable *rt, 
+			unsigned int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int hh_len;
+	struct iphdr *iph;
+	struct sk_buff *skb;
+	int err;
+
+	if (length > rt->u.dst.dev->mtu) {
+		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
+			       rt->u.dst.dev->mtu);
+		return -EMSGSIZE;
+	}
+	if (flags&MSG_PROBE)
+		goto out;
+
+	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
+
+	skb = sock_alloc_send_skb(sk, length+hh_len+15,
+				  flags&MSG_DONTWAIT, &err);
+	if (skb == NULL)
+		goto error; 
+	skb_reserve(skb, hh_len);
+
+	skb->priority = sk->sk_priority;
+	skb->dst = dst_clone(&rt->u.dst);
+
+	skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length);
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	skb->h.raw = skb->nh.raw;
+	err = memcpy_fromiovecend((void *)iph, from, 0, length);
+	if (err)
+		goto error_fault;
+
+	/* We don't modify invalid header */
+	if (length >= sizeof(*iph) && iph->ihl * 4 <= length) {
+		if (!iph->saddr)
+			iph->saddr = rt->rt_src;
+		iph->check   = 0;
+		iph->tot_len = htons(length);
+		if (!iph->id)
+			ip_select_ident(iph, &rt->u.dst, NULL);
+
+		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+	}
+
+	err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev,
+		      dst_output);
+	if (err > 0)
+		err = inet->recverr ? net_xmit_errno(err) : 0;
+	if (err)
+		goto error;
+out:
+	return 0;
+
+error_fault:
+	err = -EFAULT;
+	kfree_skb(skb);
+error:
+	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
+	return err; 
+}
+
+static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg)
+{
+	struct iovec *iov;
+	u8 __user *type = NULL;
+	u8 __user *code = NULL;
+	int probed = 0;
+	int i;
+
+	if (!msg->msg_iov)
+		return;
+
+	for (i = 0; i < msg->msg_iovlen; i++) {
+		iov = &msg->msg_iov[i];
+		if (!iov)
+			continue;
+
+		switch (fl->proto) {
+		case IPPROTO_ICMP:
+			/* check if one-byte field is readable or not. */
+			if (iov->iov_base && iov->iov_len < 1)
+				break;
+
+			if (!type) {
+				type = iov->iov_base;
+				/* check if code field is readable or not. */
+				if (iov->iov_len > 1)
+					code = type + 1;
+			} else if (!code)
+				code = iov->iov_base;
+
+			if (type && code) {
+				get_user(fl->fl_icmp_type, type);
+				__get_user(fl->fl_icmp_code, code);
+				probed = 1;
+			}
+			break;
+		default:
+			probed = 1;
+			break;
+		}
+		if (probed)
+			break;
+	}
+}
+
+static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		       size_t len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct ipcm_cookie ipc;
+	struct rtable *rt = NULL;
+	int free = 0;
+	u32 daddr;
+	u32 saddr;
+	u8  tos;
+	int err;
+
+	err = -EMSGSIZE;
+	if (len < 0 || len > 0xFFFF)
+		goto out;
+
+	/*
+	 *	Check the flags.
+	 */
+
+	err = -EOPNOTSUPP;
+	if (msg->msg_flags & MSG_OOB)	/* Mirror BSD error message */
+		goto out;               /* compatibility */
+			 
+	/*
+	 *	Get and verify the address. 
+	 */
+
+	if (msg->msg_namelen) {
+		struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name;
+		err = -EINVAL;
+		if (msg->msg_namelen < sizeof(*usin))
+			goto out;
+		if (usin->sin_family != AF_INET) {
+			static int complained;
+			if (!complained++)
+				printk(KERN_INFO "%s forgot to set AF_INET in "
+						 "raw sendmsg. Fix it!\n",
+						 current->comm);
+			err = -EAFNOSUPPORT;
+			if (usin->sin_family)
+				goto out;
+		}
+		daddr = usin->sin_addr.s_addr;
+		/* ANK: I did not forget to get protocol from port field.
+		 * I just do not know, who uses this weirdness.
+		 * IP_HDRINCL is much more convenient.
+		 */
+	} else {
+		err = -EDESTADDRREQ;
+		if (sk->sk_state != TCP_ESTABLISHED) 
+			goto out;
+		daddr = inet->daddr;
+	}
+
+	ipc.addr = inet->saddr;
+	ipc.opt = NULL;
+	ipc.oif = sk->sk_bound_dev_if;
+
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(msg, &ipc);
+		if (err)
+			goto out;
+		if (ipc.opt)
+			free = 1;
+	}
+
+	saddr = ipc.addr;
+	ipc.addr = daddr;
+
+	if (!ipc.opt)
+		ipc.opt = inet->opt;
+
+	if (ipc.opt) {
+		err = -EINVAL;
+		/* Linux does not mangle headers on raw sockets,
+		 * so that IP options + IP_HDRINCL is non-sense.
+		 */
+		if (inet->hdrincl)
+			goto done;
+		if (ipc.opt->srr) {
+			if (!daddr)
+				goto done;
+			daddr = ipc.opt->faddr;
+		}
+	}
+	tos = RT_CONN_FLAGS(sk);
+	if (msg->msg_flags & MSG_DONTROUTE)
+		tos |= RTO_ONLINK;
+
+	if (MULTICAST(daddr)) {
+		if (!ipc.oif)
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+	}
+
+	{
+		struct flowi fl = { .oif = ipc.oif,
+				    .nl_u = { .ip4_u =
+					      { .daddr = daddr,
+						.saddr = saddr,
+						.tos = tos } },
+				    .proto = inet->hdrincl ? IPPROTO_RAW :
+					    		     sk->sk_protocol,
+				  };
+		if (!inet->hdrincl)
+			raw_probe_proto_opt(&fl, msg);
+
+		err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
+	}
+	if (err)
+		goto done;
+
+	err = -EACCES;
+	if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST))
+		goto done;
+
+	if (msg->msg_flags & MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	if (inet->hdrincl)
+		err = raw_send_hdrinc(sk, msg->msg_iov, len, 
+					rt, msg->msg_flags);
+	
+	 else {
+		if (!ipc.addr)
+			ipc.addr = rt->rt_dst;
+		lock_sock(sk);
+		err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0,
+					&ipc, rt, msg->msg_flags);
+		if (err)
+			ip_flush_pending_frames(sk);
+		else if (!(msg->msg_flags & MSG_MORE))
+			err = ip_push_pending_frames(sk);
+		release_sock(sk);
+	}
+done:
+	if (free)
+		kfree(ipc.opt);
+	ip_rt_put(rt);
+
+out:	return err < 0 ? err : len;
+
+do_confirm:
+	dst_confirm(&rt->u.dst);
+	if (!(msg->msg_flags & MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto done;
+}
+
+static void raw_close(struct sock *sk, long timeout)
+{
+        /*
+	 * Raw sockets may have direct kernel refereneces. Kill them.
+	 */
+	ip_ra_control(sk, 0, NULL);
+
+	sk_common_release(sk);
+}
+
+/* This gets rid of all the nasties in af_inet. -DaveM */
+static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
+	int ret = -EINVAL;
+	int chk_addr_ret;
+
+	if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
+		goto out;
+	chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+	ret = -EADDRNOTAVAIL;
+	if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
+	    chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
+		goto out;
+	inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
+		inet->saddr = 0;  /* Use device */
+	sk_dst_reset(sk);
+	ret = 0;
+out:	return ret;
+}
+
+/*
+ *	This should be easy, if there is something there
+ *	we return it, otherwise we block.
+ */
+
+static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		       size_t len, int noblock, int flags, int *addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	size_t copied = 0;
+	int err = -EOPNOTSUPP;
+	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+	struct sk_buff *skb;
+
+	if (flags & MSG_OOB)
+		goto out;
+
+	if (addr_len)
+		*addr_len = sizeof(*sin);
+
+	if (flags & MSG_ERRQUEUE) {
+		err = ip_recv_error(sk, msg, len);
+		goto out;
+	}
+
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+
+	copied = skb->len;
+	if (len < copied) {
+		msg->msg_flags |= MSG_TRUNC;
+		copied = len;
+	}
+
+	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	if (err)
+		goto done;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin) {
+		sin->sin_family = AF_INET;
+		sin->sin_addr.s_addr = skb->nh.iph->saddr;
+		memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
+	}
+	if (inet->cmsg_flags)
+		ip_cmsg_recv(msg, skb);
+	if (flags & MSG_TRUNC)
+		copied = skb->len;
+done:
+	skb_free_datagram(sk, skb);
+out:	return err ? err : copied;
+}
+
+static int raw_init(struct sock *sk)
+{
+	struct raw_sock *rp = raw_sk(sk);
+
+	if (inet_sk(sk)->num == IPPROTO_ICMP)
+		memset(&rp->filter, 0, sizeof(rp->filter));
+	return 0;
+}
+
+static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen)
+{
+	if (optlen > sizeof(struct icmp_filter))
+		optlen = sizeof(struct icmp_filter);
+	if (copy_from_user(&raw_sk(sk)->filter, optval, optlen))
+		return -EFAULT;
+	return 0;
+}
+
+static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen)
+{
+	int len, ret = -EFAULT;
+
+	if (get_user(len, optlen))
+		goto out;
+	ret = -EINVAL;
+	if (len < 0)
+		goto out;
+	if (len > sizeof(struct icmp_filter))
+		len = sizeof(struct icmp_filter);
+	ret = -EFAULT;
+	if (put_user(len, optlen) ||
+	    copy_to_user(optval, &raw_sk(sk)->filter, len))
+		goto out;
+	ret = 0;
+out:	return ret;
+}
+
+static int raw_setsockopt(struct sock *sk, int level, int optname, 
+			  char __user *optval, int optlen)
+{
+	if (level != SOL_RAW)
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+
+	if (optname == ICMP_FILTER) {
+		if (inet_sk(sk)->num != IPPROTO_ICMP)
+			return -EOPNOTSUPP;
+		else
+			return raw_seticmpfilter(sk, optval, optlen);
+	}
+	return -ENOPROTOOPT;
+}
+
+static int raw_getsockopt(struct sock *sk, int level, int optname, 
+			  char __user *optval, int __user *optlen)
+{
+	if (level != SOL_RAW)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+
+	if (optname == ICMP_FILTER) {
+		if (inet_sk(sk)->num != IPPROTO_ICMP)
+			return -EOPNOTSUPP;
+		else
+			return raw_geticmpfilter(sk, optval, optlen);
+	}
+	return -ENOPROTOOPT;
+}
+
+static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	switch (cmd) {
+		case SIOCOUTQ: {
+			int amount = atomic_read(&sk->sk_wmem_alloc);
+			return put_user(amount, (int __user *)arg);
+		}
+		case SIOCINQ: {
+			struct sk_buff *skb;
+			int amount = 0;
+
+			spin_lock_irq(&sk->sk_receive_queue.lock);
+			skb = skb_peek(&sk->sk_receive_queue);
+			if (skb != NULL)
+				amount = skb->len;
+			spin_unlock_irq(&sk->sk_receive_queue.lock);
+			return put_user(amount, (int __user *)arg);
+		}
+
+		default:
+#ifdef CONFIG_IP_MROUTE
+			return ipmr_ioctl(sk, cmd, (void __user *)arg);
+#else
+			return -ENOIOCTLCMD;
+#endif
+	}
+}
+
+struct proto raw_prot = {
+	.name =		"RAW",
+	.owner =	THIS_MODULE,
+	.close =	raw_close,
+	.connect =	ip4_datagram_connect,
+	.disconnect =	udp_disconnect,
+	.ioctl =	raw_ioctl,
+	.init =		raw_init,
+	.setsockopt =	raw_setsockopt,
+	.getsockopt =	raw_getsockopt,
+	.sendmsg =	raw_sendmsg,
+	.recvmsg =	raw_recvmsg,
+	.bind =		raw_bind,
+	.backlog_rcv =	raw_rcv_skb,
+	.hash =		raw_v4_hash,
+	.unhash =	raw_v4_unhash,
+	.obj_size =	sizeof(struct raw_sock),
+};
+
+#ifdef CONFIG_PROC_FS
+struct raw_iter_state {
+	int bucket;
+};
+
+#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private)
+
+static struct sock *raw_get_first(struct seq_file *seq)
+{
+	struct sock *sk;
+	struct raw_iter_state* state = raw_seq_private(seq);
+
+	for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) {
+		struct hlist_node *node;
+
+		sk_for_each(sk, node, &raw_v4_htable[state->bucket])
+			if (sk->sk_family == PF_INET)
+				goto found;
+	}
+	sk = NULL;
+found:
+	return sk;
+}
+
+static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct raw_iter_state* state = raw_seq_private(seq);
+
+	do {
+		sk = sk_next(sk);
+try_again:
+		;
+	} while (sk && sk->sk_family != PF_INET);
+
+	if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) {
+		sk = sk_head(&raw_v4_htable[state->bucket]);
+		goto try_again;
+	}
+	return sk;
+}
+
+static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *sk = raw_get_first(seq);
+
+	if (sk)
+		while (pos && (sk = raw_get_next(seq, sk)) != NULL)
+			--pos;
+	return pos ? NULL : sk;
+}
+
+static void *raw_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock(&raw_v4_lock);
+	return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *sk;
+
+	if (v == SEQ_START_TOKEN)
+		sk = raw_get_first(seq);
+	else
+		sk = raw_get_next(seq, v);
+	++*pos;
+	return sk;
+}
+
+static void raw_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock(&raw_v4_lock);
+}
+
+static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	unsigned int dest = inet->daddr,
+		     src = inet->rcv_saddr;
+	__u16 destp = 0,
+	      srcp  = inet->num;
+
+	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
+		i, src, srcp, dest, destp, sp->sk_state, 
+		atomic_read(&sp->sk_wmem_alloc),
+		atomic_read(&sp->sk_rmem_alloc),
+		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		atomic_read(&sp->sk_refcnt), sp);
+	return tmpbuf;
+}
+
+static int raw_seq_show(struct seq_file *seq, void *v)
+{
+	char tmpbuf[129];
+
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%-127s\n",
+			       "  sl  local_address rem_address   st tx_queue "
+			       "rx_queue tr tm->when retrnsmt   uid  timeout "
+			       "inode");
+	else {
+		struct raw_iter_state *state = raw_seq_private(seq);
+
+		seq_printf(seq, "%-127s\n",
+			   get_raw_sock(v, tmpbuf, state->bucket));
+	}
+	return 0;
+}
+
+static struct seq_operations raw_seq_ops = {
+	.start = raw_seq_start,
+	.next  = raw_seq_next,
+	.stop  = raw_seq_stop,
+	.show  = raw_seq_show,
+};
+
+static int raw_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct raw_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (!s)
+		goto out;
+	rc = seq_open(file, &raw_seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq = file->private_data;
+	seq->private = s;
+	memset(s, 0, sizeof(*s));
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+static struct file_operations raw_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = raw_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+int __init raw_proc_init(void)
+{
+	if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops))
+		return -ENOMEM;
+	return 0;
+}
+
+void __init raw_proc_exit(void)
+{
+	proc_net_remove("raw");
+}
+#endif /* CONFIG_PROC_FS */
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
new file mode 100644
index 000000000000..9f91a116d919
--- /dev/null
+++ b/net/ipv4/route.c
@@ -0,0 +1,3177 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		ROUTE - implementation of the IP router.
+ *
+ * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
+ *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *		Alan Cox	:	Verify area fixes.
+ *		Alan Cox	:	cli() protects routing changes
+ *		Rui Oliveira	:	ICMP routing table updates
+ *		(rco@di.uminho.pt)	Routing table insertion and update
+ *		Linus Torvalds	:	Rewrote bits to be sensible
+ *		Alan Cox	:	Added BSD route gw semantics
+ *		Alan Cox	:	Super /proc >4K 
+ *		Alan Cox	:	MTU in route table
+ *		Alan Cox	: 	MSS actually. Also added the window
+ *					clamper.
+ *		Sam Lantinga	:	Fixed route matching in rt_del()
+ *		Alan Cox	:	Routing cache support.
+ *		Alan Cox	:	Removed compatibility cruft.
+ *		Alan Cox	:	RTF_REJECT support.
+ *		Alan Cox	:	TCP irtt support.
+ *		Jonathan Naylor	:	Added Metric support.
+ *	Miquel van Smoorenburg	:	BSD API fixes.
+ *	Miquel van Smoorenburg	:	Metrics.
+ *		Alan Cox	:	Use __u32 properly
+ *		Alan Cox	:	Aligned routing errors more closely with BSD
+ *					our system is still very different.
+ *		Alan Cox	:	Faster /proc handling
+ *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
+ *					routing caches and better behaviour.
+ *		
+ *		Olaf Erb	:	irtt wasn't being copied right.
+ *		Bjorn Ekwall	:	Kerneld route support.
+ *		Alan Cox	:	Multicast fixed (I hope)
+ * 		Pavel Krauz	:	Limited broadcast fixed
+ *		Mike McLagan	:	Routing by source
+ *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
+ *					route.c and rewritten from scratch.
+ *		Andi Kleen	:	Load-limit warning messages.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
+ *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
+ *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
+ *		Marc Boucher	:	routing by fwmark
+ *	Robert Olsson		:	Added rt_cache statistics
+ *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/errno.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/igmp.h>
+#include <linux/pkt_sched.h>
+#include <linux/mroute.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/random.h>
+#include <linux/jhash.h>
+#include <linux/rcupdate.h>
+#include <linux/times.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/inetpeer.h>
+#include <net/sock.h>
+#include <net/ip_fib.h>
+#include <net/arp.h>
+#include <net/tcp.h>
+#include <net/icmp.h>
+#include <net/xfrm.h>
+#include <net/ip_mp_alg.h>
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#define RT_FL_TOS(oldflp) \
+    ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
+
+#define IP_MAX_MTU	0xFFF0
+
+#define RT_GC_TIMEOUT (300*HZ)
+
+static int ip_rt_min_delay		= 2 * HZ;
+static int ip_rt_max_delay		= 10 * HZ;
+static int ip_rt_max_size;
+static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
+static int ip_rt_gc_interval		= 60 * HZ;
+static int ip_rt_gc_min_interval	= HZ / 2;
+static int ip_rt_redirect_number	= 9;
+static int ip_rt_redirect_load		= HZ / 50;
+static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
+static int ip_rt_error_cost		= HZ;
+static int ip_rt_error_burst		= 5 * HZ;
+static int ip_rt_gc_elasticity		= 8;
+static int ip_rt_mtu_expires		= 10 * 60 * HZ;
+static int ip_rt_min_pmtu		= 512 + 20 + 20;
+static int ip_rt_min_advmss		= 256;
+static int ip_rt_secret_interval	= 10 * 60 * HZ;
+static unsigned long rt_deadline;
+
+#define RTprint(a...)	printk(KERN_DEBUG a)
+
+static struct timer_list rt_flush_timer;
+static struct timer_list rt_periodic_timer;
+static struct timer_list rt_secret_timer;
+
+/*
+ *	Interface to generic destination cache.
+ */
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
+static void		 ipv4_dst_destroy(struct dst_entry *dst);
+static void		 ipv4_dst_ifdown(struct dst_entry *dst,
+					 struct net_device *dev, int how);
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
+static void		 ipv4_link_failure(struct sk_buff *skb);
+static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
+static int rt_garbage_collect(void);
+
+
+static struct dst_ops ipv4_dst_ops = {
+	.family =		AF_INET,
+	.protocol =		__constant_htons(ETH_P_IP),
+	.gc =			rt_garbage_collect,
+	.check =		ipv4_dst_check,
+	.destroy =		ipv4_dst_destroy,
+	.ifdown =		ipv4_dst_ifdown,
+	.negative_advice =	ipv4_negative_advice,
+	.link_failure =		ipv4_link_failure,
+	.update_pmtu =		ip_rt_update_pmtu,
+	.entry_size =		sizeof(struct rtable),
+};
+
+#define ECN_OR_COST(class)	TC_PRIO_##class
+
+__u8 ip_tos2prio[16] = {
+	TC_PRIO_BESTEFFORT,
+	ECN_OR_COST(FILLER),
+	TC_PRIO_BESTEFFORT,
+	ECN_OR_COST(BESTEFFORT),
+	TC_PRIO_BULK,
+	ECN_OR_COST(BULK),
+	TC_PRIO_BULK,
+	ECN_OR_COST(BULK),
+	TC_PRIO_INTERACTIVE,
+	ECN_OR_COST(INTERACTIVE),
+	TC_PRIO_INTERACTIVE,
+	ECN_OR_COST(INTERACTIVE),
+	TC_PRIO_INTERACTIVE_BULK,
+	ECN_OR_COST(INTERACTIVE_BULK),
+	TC_PRIO_INTERACTIVE_BULK,
+	ECN_OR_COST(INTERACTIVE_BULK)
+};
+
+
+/*
+ * Route cache.
+ */
+
+/* The locking scheme is rather straight forward:
+ *
+ * 1) Read-Copy Update protects the buckets of the central route hash.
+ * 2) Only writers remove entries, and they hold the lock
+ *    as they look at rtable reference counts.
+ * 3) Only readers acquire references to rtable entries,
+ *    they do so with atomic increments and with the
+ *    lock held.
+ */
+
+struct rt_hash_bucket {
+	struct rtable	*chain;
+	spinlock_t	lock;
+} __attribute__((__aligned__(8)));
+
+static struct rt_hash_bucket 	*rt_hash_table;
+static unsigned			rt_hash_mask;
+static int			rt_hash_log;
+static unsigned int		rt_hash_rnd;
+
+struct rt_cache_stat *rt_cache_stat;
+
+static int rt_intern_hash(unsigned hash, struct rtable *rth,
+				struct rtable **res);
+
+static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
+{
+	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
+		& rt_hash_mask);
+}
+
+#ifdef CONFIG_PROC_FS
+struct rt_cache_iter_state {
+	int bucket;
+};
+
+static struct rtable *rt_cache_get_first(struct seq_file *seq)
+{
+	struct rtable *r = NULL;
+	struct rt_cache_iter_state *st = seq->private;
+
+	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+		rcu_read_lock_bh();
+		r = rt_hash_table[st->bucket].chain;
+		if (r)
+			break;
+		rcu_read_unlock_bh();
+	}
+	return r;
+}
+
+static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
+{
+	struct rt_cache_iter_state *st = rcu_dereference(seq->private);
+
+	r = r->u.rt_next;
+	while (!r) {
+		rcu_read_unlock_bh();
+		if (--st->bucket < 0)
+			break;
+		rcu_read_lock_bh();
+		r = rt_hash_table[st->bucket].chain;
+	}
+	return r;
+}
+
+static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct rtable *r = rt_cache_get_first(seq);
+
+	if (r)
+		while (pos && (r = rt_cache_get_next(seq, r)))
+			--pos;
+	return pos ? NULL : r;
+}
+
+static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct rtable *r = NULL;
+
+	if (v == SEQ_START_TOKEN)
+		r = rt_cache_get_first(seq);
+	else
+		r = rt_cache_get_next(seq, v);
+	++*pos;
+	return r;
+}
+
+static void rt_cache_seq_stop(struct seq_file *seq, void *v)
+{
+	if (v && v != SEQ_START_TOKEN)
+		rcu_read_unlock_bh();
+}
+
+static int rt_cache_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%-127s\n",
+			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
+			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
+			   "HHUptod\tSpecDst");
+	else {
+		struct rtable *r = v;
+		char temp[256];
+
+		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
+			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
+			r->u.dst.dev ? r->u.dst.dev->name : "*",
+			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
+			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
+			r->u.dst.__use, 0, (unsigned long)r->rt_src,
+			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
+			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
+			dst_metric(&r->u.dst, RTAX_WINDOW),
+			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
+			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
+			r->fl.fl4_tos,
+			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
+			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
+				       dev_queue_xmit) : 0,
+			r->rt_spec_dst);
+		seq_printf(seq, "%-127s\n", temp);
+        }
+  	return 0;
+}
+
+static struct seq_operations rt_cache_seq_ops = {
+	.start  = rt_cache_seq_start,
+	.next   = rt_cache_seq_next,
+	.stop   = rt_cache_seq_stop,
+	.show   = rt_cache_seq_show,
+};
+
+static int rt_cache_seq_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (!s)
+		goto out;
+	rc = seq_open(file, &rt_cache_seq_ops);
+	if (rc)
+		goto out_kfree;
+	seq          = file->private_data;
+	seq->private = s;
+	memset(s, 0, sizeof(*s));
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+static struct file_operations rt_cache_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = rt_cache_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release_private,
+};
+
+
+static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	int cpu;
+
+	if (*pos == 0)
+		return SEQ_START_TOKEN;
+
+	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu+1;
+		return per_cpu_ptr(rt_cache_stat, cpu);
+	}
+	return NULL;
+}
+
+static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	int cpu;
+
+	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+		if (!cpu_possible(cpu))
+			continue;
+		*pos = cpu+1;
+		return per_cpu_ptr(rt_cache_stat, cpu);
+	}
+	return NULL;
+	
+}
+
+static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
+{
+
+}
+
+static int rt_cpu_seq_show(struct seq_file *seq, void *v)
+{
+	struct rt_cache_stat *st = v;
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "entries  in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
+		return 0;
+	}
+	
+	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
+		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
+		   atomic_read(&ipv4_dst_ops.entries),
+		   st->in_hit,
+		   st->in_slow_tot,
+		   st->in_slow_mc,
+		   st->in_no_route,
+		   st->in_brd,
+		   st->in_martian_dst,
+		   st->in_martian_src,
+
+		   st->out_hit,
+		   st->out_slow_tot,
+		   st->out_slow_mc, 
+
+		   st->gc_total,
+		   st->gc_ignored,
+		   st->gc_goal_miss,
+		   st->gc_dst_overflow,
+		   st->in_hlist_search,
+		   st->out_hlist_search
+		);
+	return 0;
+}
+
+static struct seq_operations rt_cpu_seq_ops = {
+	.start  = rt_cpu_seq_start,
+	.next   = rt_cpu_seq_next,
+	.stop   = rt_cpu_seq_stop,
+	.show   = rt_cpu_seq_show,
+};
+
+
+static int rt_cpu_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &rt_cpu_seq_ops);
+}
+
+static struct file_operations rt_cpu_seq_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = rt_cpu_seq_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release,
+};
+
+#endif /* CONFIG_PROC_FS */
+  
+static __inline__ void rt_free(struct rtable *rt)
+{
+	multipath_remove(rt);
+	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static __inline__ void rt_drop(struct rtable *rt)
+{
+	multipath_remove(rt);
+	ip_rt_put(rt);
+	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
+}
+
+static __inline__ int rt_fast_clean(struct rtable *rth)
+{
+	/* Kill broadcast/multicast entries very aggresively, if they
+	   collide in hash table with more useful entries */
+	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
+		rth->fl.iif && rth->u.rt_next;
+}
+
+static __inline__ int rt_valuable(struct rtable *rth)
+{
+	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
+		rth->u.dst.expires;
+}
+
+static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
+{
+	unsigned long age;
+	int ret = 0;
+
+	if (atomic_read(&rth->u.dst.__refcnt))
+		goto out;
+
+	ret = 1;
+	if (rth->u.dst.expires &&
+	    time_after_eq(jiffies, rth->u.dst.expires))
+		goto out;
+
+	age = jiffies - rth->u.dst.lastuse;
+	ret = 0;
+	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
+	    (age <= tmo2 && rt_valuable(rth)))
+		goto out;
+	ret = 1;
+out:	return ret;
+}
+
+/* Bits of score are:
+ * 31: very valuable
+ * 30: not quite useless
+ * 29..0: usage counter
+ */
+static inline u32 rt_score(struct rtable *rt)
+{
+	u32 score = jiffies - rt->u.dst.lastuse;
+
+	score = ~score & ~(3<<30);
+
+	if (rt_valuable(rt))
+		score |= (1<<31);
+
+	if (!rt->fl.iif ||
+	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
+		score |= (1<<30);
+
+	return score;
+}
+
+static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
+{
+	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
+	       fl1->oif     == fl2->oif &&
+	       fl1->iif     == fl2->iif;
+}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+static struct rtable **rt_remove_balanced_route(struct rtable **chain_head,
+						struct rtable *expentry,
+						int *removed_count)
+{
+	int passedexpired = 0;
+	struct rtable **nextstep = NULL;
+	struct rtable **rthp = chain_head;
+	struct rtable *rth;
+
+	if (removed_count)
+		*removed_count = 0;
+
+	while ((rth = *rthp) != NULL) {
+		if (rth == expentry)
+			passedexpired = 1;
+
+		if (((*rthp)->u.dst.flags & DST_BALANCED) != 0  &&
+		    compare_keys(&(*rthp)->fl, &expentry->fl)) {
+			if (*rthp == expentry) {
+				*rthp = rth->u.rt_next;
+				continue;
+			} else {
+				*rthp = rth->u.rt_next;
+				rt_free(rth);
+				if (removed_count)
+					++(*removed_count);
+			}
+		} else {
+			if (!((*rthp)->u.dst.flags & DST_BALANCED) &&
+			    passedexpired && !nextstep)
+				nextstep = &rth->u.rt_next;
+
+			rthp = &rth->u.rt_next;
+		}
+	}
+
+	rt_free(expentry);
+	if (removed_count)
+		++(*removed_count);
+
+	return nextstep;
+}
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+
+
+/* This runs via a timer and thus is always in BH context. */
+static void rt_check_expire(unsigned long dummy)
+{
+	static int rover;
+	int i = rover, t;
+	struct rtable *rth, **rthp;
+	unsigned long now = jiffies;
+
+	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
+	     t -= ip_rt_gc_timeout) {
+		unsigned long tmo = ip_rt_gc_timeout;
+
+		i = (i + 1) & rt_hash_mask;
+		rthp = &rt_hash_table[i].chain;
+
+		spin_lock(&rt_hash_table[i].lock);
+		while ((rth = *rthp) != NULL) {
+			if (rth->u.dst.expires) {
+				/* Entry is expired even if it is in use */
+				if (time_before_eq(now, rth->u.dst.expires)) {
+					tmo >>= 1;
+					rthp = &rth->u.rt_next;
+					continue;
+				}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
+				tmo >>= 1;
+				rthp = &rth->u.rt_next;
+				continue;
+			}
+
+			/* Cleanup aged off entries. */
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+			/* remove all related balanced entries if necessary */
+			if (rth->u.dst.flags & DST_BALANCED) {
+				rthp = rt_remove_balanced_route(
+					&rt_hash_table[i].chain,
+					rth, NULL);
+				if (!rthp)
+					break;
+			} else {
+				*rthp = rth->u.rt_next;
+				rt_free(rth);
+			}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+ 			*rthp = rth->u.rt_next;
+ 			rt_free(rth);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+		}
+		spin_unlock(&rt_hash_table[i].lock);
+
+		/* Fallback loop breaker. */
+		if (time_after(jiffies, now))
+			break;
+	}
+	rover = i;
+	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
+}
+
+/* This can run from both BH and non-BH contexts, the latter
+ * in the case of a forced flush event.
+ */
+static void rt_run_flush(unsigned long dummy)
+{
+	int i;
+	struct rtable *rth, *next;
+
+	rt_deadline = 0;
+
+	get_random_bytes(&rt_hash_rnd, 4);
+
+	for (i = rt_hash_mask; i >= 0; i--) {
+		spin_lock_bh(&rt_hash_table[i].lock);
+		rth = rt_hash_table[i].chain;
+		if (rth)
+			rt_hash_table[i].chain = NULL;
+		spin_unlock_bh(&rt_hash_table[i].lock);
+
+		for (; rth; rth = next) {
+			next = rth->u.rt_next;
+			rt_free(rth);
+		}
+	}
+}
+
+static DEFINE_SPINLOCK(rt_flush_lock);
+
+void rt_cache_flush(int delay)
+{
+	unsigned long now = jiffies;
+	int user_mode = !in_softirq();
+
+	if (delay < 0)
+		delay = ip_rt_min_delay;
+
+	/* flush existing multipath state*/
+	multipath_flush();
+
+	spin_lock_bh(&rt_flush_lock);
+
+	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
+		long tmo = (long)(rt_deadline - now);
+
+		/* If flush timer is already running
+		   and flush request is not immediate (delay > 0):
+
+		   if deadline is not achieved, prolongate timer to "delay",
+		   otherwise fire it at deadline time.
+		 */
+
+		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
+			tmo = 0;
+		
+		if (delay > tmo)
+			delay = tmo;
+	}
+
+	if (delay <= 0) {
+		spin_unlock_bh(&rt_flush_lock);
+		rt_run_flush(0);
+		return;
+	}
+
+	if (rt_deadline == 0)
+		rt_deadline = now + ip_rt_max_delay;
+
+	mod_timer(&rt_flush_timer, now+delay);
+	spin_unlock_bh(&rt_flush_lock);
+}
+
+static void rt_secret_rebuild(unsigned long dummy)
+{
+	unsigned long now = jiffies;
+
+	rt_cache_flush(0);
+	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
+}
+
+/*
+   Short description of GC goals.
+
+   We want to build algorithm, which will keep routing cache
+   at some equilibrium point, when number of aged off entries
+   is kept approximately equal to newly generated ones.
+
+   Current expiration strength is variable "expire".
+   We try to adjust it dynamically, so that if networking
+   is idle expires is large enough to keep enough of warm entries,
+   and when load increases it reduces to limit cache size.
+ */
+
+static int rt_garbage_collect(void)
+{
+	static unsigned long expire = RT_GC_TIMEOUT;
+	static unsigned long last_gc;
+	static int rover;
+	static int equilibrium;
+	struct rtable *rth, **rthp;
+	unsigned long now = jiffies;
+	int goal;
+
+	/*
+	 * Garbage collection is pretty expensive,
+	 * do not make it too frequently.
+	 */
+
+	RT_CACHE_STAT_INC(gc_total);
+
+	if (now - last_gc < ip_rt_gc_min_interval &&
+	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+		RT_CACHE_STAT_INC(gc_ignored);
+		goto out;
+	}
+
+	/* Calculate number of entries, which we want to expire now. */
+	goal = atomic_read(&ipv4_dst_ops.entries) -
+		(ip_rt_gc_elasticity << rt_hash_log);
+	if (goal <= 0) {
+		if (equilibrium < ipv4_dst_ops.gc_thresh)
+			equilibrium = ipv4_dst_ops.gc_thresh;
+		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+		if (goal > 0) {
+			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
+			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+		}
+	} else {
+		/* We are in dangerous area. Try to reduce cache really
+		 * aggressively.
+		 */
+		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
+		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+	}
+
+	if (now - last_gc >= ip_rt_gc_min_interval)
+		last_gc = now;
+
+	if (goal <= 0) {
+		equilibrium += goal;
+		goto work_done;
+	}
+
+	do {
+		int i, k;
+
+		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
+			unsigned long tmo = expire;
+
+			k = (k + 1) & rt_hash_mask;
+			rthp = &rt_hash_table[k].chain;
+			spin_lock_bh(&rt_hash_table[k].lock);
+			while ((rth = *rthp) != NULL) {
+				if (!rt_may_expire(rth, tmo, expire)) {
+					tmo >>= 1;
+					rthp = &rth->u.rt_next;
+					continue;
+				}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+				/* remove all related balanced entries
+				 * if necessary
+				 */
+				if (rth->u.dst.flags & DST_BALANCED) {
+					int r;
+
+					rthp = rt_remove_balanced_route(
+						&rt_hash_table[i].chain,
+						rth,
+						&r);
+					goal -= r;
+					if (!rthp)
+						break;
+				} else {
+					*rthp = rth->u.rt_next;
+					rt_free(rth);
+					goal--;
+				}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+				*rthp = rth->u.rt_next;
+				rt_free(rth);
+				goal--;
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+			}
+			spin_unlock_bh(&rt_hash_table[k].lock);
+			if (goal <= 0)
+				break;
+		}
+		rover = k;
+
+		if (goal <= 0)
+			goto work_done;
+
+		/* Goal is not achieved. We stop process if:
+
+		   - if expire reduced to zero. Otherwise, expire is halfed.
+		   - if table is not full.
+		   - if we are called from interrupt.
+		   - jiffies check is just fallback/debug loop breaker.
+		     We will not spin here for long time in any case.
+		 */
+
+		RT_CACHE_STAT_INC(gc_goal_miss);
+
+		if (expire == 0)
+			break;
+
+		expire >>= 1;
+#if RT_CACHE_DEBUG >= 2
+		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
+				atomic_read(&ipv4_dst_ops.entries), goal, i);
+#endif
+
+		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+			goto out;
+	} while (!in_softirq() && time_before_eq(jiffies, now));
+
+	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+		goto out;
+	if (net_ratelimit())
+		printk(KERN_WARNING "dst cache overflow\n");
+	RT_CACHE_STAT_INC(gc_dst_overflow);
+	return 1;
+
+work_done:
+	expire += ip_rt_gc_min_interval;
+	if (expire > ip_rt_gc_timeout ||
+	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+		expire = ip_rt_gc_timeout;
+#if RT_CACHE_DEBUG >= 2
+	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
+			atomic_read(&ipv4_dst_ops.entries), goal, rover);
+#endif
+out:	return 0;
+}
+
+static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+{
+	struct rtable	*rth, **rthp;
+	unsigned long	now;
+	struct rtable *cand, **candp;
+	u32 		min_score;
+	int		chain_length;
+	int attempts = !in_softirq();
+
+restart:
+	chain_length = 0;
+	min_score = ~(u32)0;
+	cand = NULL;
+	candp = NULL;
+	now = jiffies;
+
+	rthp = &rt_hash_table[hash].chain;
+
+	spin_lock_bh(&rt_hash_table[hash].lock);
+	while ((rth = *rthp) != NULL) {
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+		if (!(rth->u.dst.flags & DST_BALANCED) &&
+		    compare_keys(&rth->fl, &rt->fl)) {
+#else
+		if (compare_keys(&rth->fl, &rt->fl)) {
+#endif
+			/* Put it first */
+			*rthp = rth->u.rt_next;
+			/*
+			 * Since lookup is lockfree, the deletion
+			 * must be visible to another weakly ordered CPU before
+			 * the insertion at the start of the hash chain.
+			 */
+			rcu_assign_pointer(rth->u.rt_next,
+					   rt_hash_table[hash].chain);
+			/*
+			 * Since lookup is lockfree, the update writes
+			 * must be ordered for consistency on SMP.
+			 */
+			rcu_assign_pointer(rt_hash_table[hash].chain, rth);
+
+			rth->u.dst.__use++;
+			dst_hold(&rth->u.dst);
+			rth->u.dst.lastuse = now;
+			spin_unlock_bh(&rt_hash_table[hash].lock);
+
+			rt_drop(rt);
+			*rp = rth;
+			return 0;
+		}
+
+		if (!atomic_read(&rth->u.dst.__refcnt)) {
+			u32 score = rt_score(rth);
+
+			if (score <= min_score) {
+				cand = rth;
+				candp = rthp;
+				min_score = score;
+			}
+		}
+
+		chain_length++;
+
+		rthp = &rth->u.rt_next;
+	}
+
+	if (cand) {
+		/* ip_rt_gc_elasticity used to be average length of chain
+		 * length, when exceeded gc becomes really aggressive.
+		 *
+		 * The second limit is less certain. At the moment it allows
+		 * only 2 entries per bucket. We will see.
+		 */
+		if (chain_length > ip_rt_gc_elasticity) {
+			*candp = cand->u.rt_next;
+			rt_free(cand);
+		}
+	}
+
+	/* Try to bind route to arp only if it is output
+	   route or unicast forwarding path.
+	 */
+	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
+		int err = arp_bind_neighbour(&rt->u.dst);
+		if (err) {
+			spin_unlock_bh(&rt_hash_table[hash].lock);
+
+			if (err != -ENOBUFS) {
+				rt_drop(rt);
+				return err;
+			}
+
+			/* Neighbour tables are full and nothing
+			   can be released. Try to shrink route cache,
+			   it is most likely it holds some neighbour records.
+			 */
+			if (attempts-- > 0) {
+				int saved_elasticity = ip_rt_gc_elasticity;
+				int saved_int = ip_rt_gc_min_interval;
+				ip_rt_gc_elasticity	= 1;
+				ip_rt_gc_min_interval	= 0;
+				rt_garbage_collect();
+				ip_rt_gc_min_interval	= saved_int;
+				ip_rt_gc_elasticity	= saved_elasticity;
+				goto restart;
+			}
+
+			if (net_ratelimit())
+				printk(KERN_WARNING "Neighbour table overflow.\n");
+			rt_drop(rt);
+			return -ENOBUFS;
+		}
+	}
+
+	rt->u.rt_next = rt_hash_table[hash].chain;
+#if RT_CACHE_DEBUG >= 2
+	if (rt->u.rt_next) {
+		struct rtable *trt;
+		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
+		       NIPQUAD(rt->rt_dst));
+		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
+			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
+		printk("\n");
+	}
+#endif
+	rt_hash_table[hash].chain = rt;
+	spin_unlock_bh(&rt_hash_table[hash].lock);
+	*rp = rt;
+	return 0;
+}
+
+void rt_bind_peer(struct rtable *rt, int create)
+{
+	static DEFINE_SPINLOCK(rt_peer_lock);
+	struct inet_peer *peer;
+
+	peer = inet_getpeer(rt->rt_dst, create);
+
+	spin_lock_bh(&rt_peer_lock);
+	if (rt->peer == NULL) {
+		rt->peer = peer;
+		peer = NULL;
+	}
+	spin_unlock_bh(&rt_peer_lock);
+	if (peer)
+		inet_putpeer(peer);
+}
+
+/*
+ * Peer allocation may fail only in serious out-of-memory conditions.  However
+ * we still can generate some output.
+ * Random ID selection looks a bit dangerous because we have no chances to
+ * select ID being unique in a reasonable period of time.
+ * But broken packet identifier may be better than no packet at all.
+ */
+static void ip_select_fb_ident(struct iphdr *iph)
+{
+	static DEFINE_SPINLOCK(ip_fb_id_lock);
+	static u32 ip_fallback_id;
+	u32 salt;
+
+	spin_lock_bh(&ip_fb_id_lock);
+	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
+	iph->id = htons(salt & 0xFFFF);
+	ip_fallback_id = salt;
+	spin_unlock_bh(&ip_fb_id_lock);
+}
+
+void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
+{
+	struct rtable *rt = (struct rtable *) dst;
+
+	if (rt) {
+		if (rt->peer == NULL)
+			rt_bind_peer(rt, 1);
+
+		/* If peer is attached to destination, it is never detached,
+		   so that we need not to grab a lock to dereference it.
+		 */
+		if (rt->peer) {
+			iph->id = htons(inet_getid(rt->peer, more));
+			return;
+		}
+	} else
+		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));
+
+	ip_select_fb_ident(iph);
+}
+
+static void rt_del(unsigned hash, struct rtable *rt)
+{
+	struct rtable **rthp;
+
+	spin_lock_bh(&rt_hash_table[hash].lock);
+	ip_rt_put(rt);
+	for (rthp = &rt_hash_table[hash].chain; *rthp;
+	     rthp = &(*rthp)->u.rt_next)
+		if (*rthp == rt) {
+			*rthp = rt->u.rt_next;
+			rt_free(rt);
+			break;
+		}
+	spin_unlock_bh(&rt_hash_table[hash].lock);
+}
+
+void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
+		    u32 saddr, u8 tos, struct net_device *dev)
+{
+	int i, k;
+	struct in_device *in_dev = in_dev_get(dev);
+	struct rtable *rth, **rthp;
+	u32  skeys[2] = { saddr, 0 };
+	int  ikeys[2] = { dev->ifindex, 0 };
+
+	tos &= IPTOS_RT_MASK;
+
+	if (!in_dev)
+		return;
+
+	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
+	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
+		goto reject_redirect;
+
+	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
+		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
+			goto reject_redirect;
+		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
+			goto reject_redirect;
+	} else {
+		if (inet_addr_type(new_gw) != RTN_UNICAST)
+			goto reject_redirect;
+	}
+
+	for (i = 0; i < 2; i++) {
+		for (k = 0; k < 2; k++) {
+			unsigned hash = rt_hash_code(daddr,
+						     skeys[i] ^ (ikeys[k] << 5),
+						     tos);
+
+			rthp=&rt_hash_table[hash].chain;
+
+			rcu_read_lock();
+			while ((rth = rcu_dereference(*rthp)) != NULL) {
+				struct rtable *rt;
+
+				if (rth->fl.fl4_dst != daddr ||
+				    rth->fl.fl4_src != skeys[i] ||
+				    rth->fl.fl4_tos != tos ||
+				    rth->fl.oif != ikeys[k] ||
+				    rth->fl.iif != 0) {
+					rthp = &rth->u.rt_next;
+					continue;
+				}
+
+				if (rth->rt_dst != daddr ||
+				    rth->rt_src != saddr ||
+				    rth->u.dst.error ||
+				    rth->rt_gateway != old_gw ||
+				    rth->u.dst.dev != dev)
+					break;
+
+				dst_hold(&rth->u.dst);
+				rcu_read_unlock();
+
+				rt = dst_alloc(&ipv4_dst_ops);
+				if (rt == NULL) {
+					ip_rt_put(rth);
+					in_dev_put(in_dev);
+					return;
+				}
+
+				/* Copy all the information. */
+				*rt = *rth;
+ 				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
+				rt->u.dst.__use		= 1;
+				atomic_set(&rt->u.dst.__refcnt, 1);
+				rt->u.dst.child		= NULL;
+				if (rt->u.dst.dev)
+					dev_hold(rt->u.dst.dev);
+				if (rt->idev)
+					in_dev_hold(rt->idev);
+				rt->u.dst.obsolete	= 0;
+				rt->u.dst.lastuse	= jiffies;
+				rt->u.dst.path		= &rt->u.dst;
+				rt->u.dst.neighbour	= NULL;
+				rt->u.dst.hh		= NULL;
+				rt->u.dst.xfrm		= NULL;
+
+				rt->rt_flags		|= RTCF_REDIRECTED;
+
+				/* Gateway is different ... */
+				rt->rt_gateway		= new_gw;
+
+				/* Redirect received -> path was valid */
+				dst_confirm(&rth->u.dst);
+
+				if (rt->peer)
+					atomic_inc(&rt->peer->refcnt);
+
+				if (arp_bind_neighbour(&rt->u.dst) ||
+				    !(rt->u.dst.neighbour->nud_state &
+					    NUD_VALID)) {
+					if (rt->u.dst.neighbour)
+						neigh_event_send(rt->u.dst.neighbour, NULL);
+					ip_rt_put(rth);
+					rt_drop(rt);
+					goto do_next;
+				}
+
+				rt_del(hash, rth);
+				if (!rt_intern_hash(hash, rt, &rt))
+					ip_rt_put(rt);
+				goto do_next;
+			}
+			rcu_read_unlock();
+		do_next:
+			;
+		}
+	}
+	in_dev_put(in_dev);
+	return;
+
+reject_redirect:
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
+			"%u.%u.%u.%u ignored.\n"
+			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
+			"tos %02x\n",
+		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
+		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
+#endif
+	in_dev_put(in_dev);
+}
+
+static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
+{
+	struct rtable *rt = (struct rtable*)dst;
+	struct dst_entry *ret = dst;
+
+	if (rt) {
+		if (dst->obsolete) {
+			ip_rt_put(rt);
+			ret = NULL;
+		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
+			   rt->u.dst.expires) {
+			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
+						     rt->fl.fl4_src ^
+							(rt->fl.oif << 5),
+						     rt->fl.fl4_tos);
+#if RT_CACHE_DEBUG >= 1
+			printk(KERN_DEBUG "ip_rt_advice: redirect to "
+					  "%u.%u.%u.%u/%02x dropped\n",
+				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
+#endif
+			rt_del(hash, rt);
+			ret = NULL;
+		}
+	}
+	return ret;
+}
+
+/*
+ * Algorithm:
+ *	1. The first ip_rt_redirect_number redirects are sent
+ *	   with exponential backoff, then we stop sending them at all,
+ *	   assuming that the host ignores our redirects.
+ *	2. If we did not see packets requiring redirects
+ *	   during ip_rt_redirect_silence, we assume that the host
+ *	   forgot redirected route and start to send redirects again.
+ *
+ * This algorithm is much cheaper and more intelligent than dumb load limiting
+ * in icmp.c.
+ *
+ * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
+ * and "frag. need" (breaks PMTU discovery) in icmp.c.
+ */
+
+void ip_rt_send_redirect(struct sk_buff *skb)
+{
+	struct rtable *rt = (struct rtable*)skb->dst;
+	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
+
+	if (!in_dev)
+		return;
+
+	if (!IN_DEV_TX_REDIRECTS(in_dev))
+		goto out;
+
+	/* No redirected packets during ip_rt_redirect_silence;
+	 * reset the algorithm.
+	 */
+	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
+		rt->u.dst.rate_tokens = 0;
+
+	/* Too many ignored redirects; do not send anything
+	 * set u.dst.rate_last to the last seen redirected packet.
+	 */
+	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
+		rt->u.dst.rate_last = jiffies;
+		goto out;
+	}
+
+	/* Check for load limit; set rate_last to the latest sent
+	 * redirect.
+	 */
+	if (time_after(jiffies,
+		       (rt->u.dst.rate_last +
+			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
+		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
+		rt->u.dst.rate_last = jiffies;
+		++rt->u.dst.rate_tokens;
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+		if (IN_DEV_LOG_MARTIANS(in_dev) &&
+		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
+		    net_ratelimit())
+			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
+				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
+				NIPQUAD(rt->rt_src), rt->rt_iif,
+				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
+#endif
+	}
+out:
+        in_dev_put(in_dev);
+}
+
+static int ip_error(struct sk_buff *skb)
+{
+	struct rtable *rt = (struct rtable*)skb->dst;
+	unsigned long now;
+	int code;
+
+	switch (rt->u.dst.error) {
+		case EINVAL:
+		default:
+			goto out;
+		case EHOSTUNREACH:
+			code = ICMP_HOST_UNREACH;
+			break;
+		case ENETUNREACH:
+			code = ICMP_NET_UNREACH;
+			break;
+		case EACCES:
+			code = ICMP_PKT_FILTERED;
+			break;
+	}
+
+	now = jiffies;
+	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
+	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
+		rt->u.dst.rate_tokens = ip_rt_error_burst;
+	rt->u.dst.rate_last = now;
+	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
+		rt->u.dst.rate_tokens -= ip_rt_error_cost;
+		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
+	}
+
+out:	kfree_skb(skb);
+	return 0;
+} 
+
+/*
+ *	The last two values are not from the RFC but
+ *	are needed for AMPRnet AX.25 paths.
+ */
+
+static unsigned short mtu_plateau[] =
+{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
+
+static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
+{
+	int i;
+	
+	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
+		if (old_mtu > mtu_plateau[i])
+			return mtu_plateau[i];
+	return 68;
+}
+
+unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+{
+	int i;
+	unsigned short old_mtu = ntohs(iph->tot_len);
+	struct rtable *rth;
+	u32  skeys[2] = { iph->saddr, 0, };
+	u32  daddr = iph->daddr;
+	u8   tos = iph->tos & IPTOS_RT_MASK;
+	unsigned short est_mtu = 0;
+
+	if (ipv4_config.no_pmtu_disc)
+		return 0;
+
+	for (i = 0; i < 2; i++) {
+		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+
+		rcu_read_lock();
+		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+		     rth = rcu_dereference(rth->u.rt_next)) {
+			if (rth->fl.fl4_dst == daddr &&
+			    rth->fl.fl4_src == skeys[i] &&
+			    rth->rt_dst  == daddr &&
+			    rth->rt_src  == iph->saddr &&
+			    rth->fl.fl4_tos == tos &&
+			    rth->fl.iif == 0 &&
+			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
+				unsigned short mtu = new_mtu;
+
+				if (new_mtu < 68 || new_mtu >= old_mtu) {
+
+					/* BSD 4.2 compatibility hack :-( */
+					if (mtu == 0 &&
+					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
+					    old_mtu >= 68 + (iph->ihl << 2))
+						old_mtu -= iph->ihl << 2;
+
+					mtu = guess_mtu(old_mtu);
+				}
+				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
+					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 
+						dst_confirm(&rth->u.dst);
+						if (mtu < ip_rt_min_pmtu) {
+							mtu = ip_rt_min_pmtu;
+							rth->u.dst.metrics[RTAX_LOCK-1] |=
+								(1 << RTAX_MTU);
+						}
+						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
+						dst_set_expires(&rth->u.dst,
+							ip_rt_mtu_expires);
+					}
+					est_mtu = mtu;
+				}
+			}
+		}
+		rcu_read_unlock();
+	}
+	return est_mtu ? : new_mtu;
+}
+
+static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
+	    !(dst_metric_locked(dst, RTAX_MTU))) {
+		if (mtu < ip_rt_min_pmtu) {
+			mtu = ip_rt_min_pmtu;
+			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
+		}
+		dst->metrics[RTAX_MTU-1] = mtu;
+		dst_set_expires(dst, ip_rt_mtu_expires);
+	}
+}
+
+static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
+{
+	return NULL;
+}
+
+static void ipv4_dst_destroy(struct dst_entry *dst)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	struct inet_peer *peer = rt->peer;
+	struct in_device *idev = rt->idev;
+
+	if (peer) {
+		rt->peer = NULL;
+		inet_putpeer(peer);
+	}
+
+	if (idev) {
+		rt->idev = NULL;
+		in_dev_put(idev);
+	}
+}
+
+static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
+			    int how)
+{
+	struct rtable *rt = (struct rtable *) dst;
+	struct in_device *idev = rt->idev;
+	if (dev != &loopback_dev && idev && idev->dev == dev) {
+		struct in_device *loopback_idev = in_dev_get(&loopback_dev);
+		if (loopback_idev) {
+			rt->idev = loopback_idev;
+			in_dev_put(idev);
+		}
+	}
+}
+
+static void ipv4_link_failure(struct sk_buff *skb)
+{
+	struct rtable *rt;
+
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
+
+	rt = (struct rtable *) skb->dst;
+	if (rt)
+		dst_set_expires(&rt->u.dst, 0);
+}
+
+static int ip_rt_bug(struct sk_buff *skb)
+{
+	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
+		NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr),
+		skb->dev ? skb->dev->name : "?");
+	kfree_skb(skb);
+	return 0;
+}
+
+/*
+   We do not cache source address of outgoing interface,
+   because it is used only by IP RR, TS and SRR options,
+   so that it out of fast path.
+
+   BTW remember: "addr" is allowed to be not aligned
+   in IP options!
+ */
+
+void ip_rt_get_source(u8 *addr, struct rtable *rt)
+{
+	u32 src;
+	struct fib_result res;
+
+	if (rt->fl.iif == 0)
+		src = rt->rt_src;
+	else if (fib_lookup(&rt->fl, &res) == 0) {
+		src = FIB_RES_PREFSRC(res);
+		fib_res_put(&res);
+	} else
+		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
+					RT_SCOPE_UNIVERSE);
+	memcpy(addr, &src, 4);
+}
+
+#ifdef CONFIG_NET_CLS_ROUTE
+static void set_class_tag(struct rtable *rt, u32 tag)
+{
+	if (!(rt->u.dst.tclassid & 0xFFFF))
+		rt->u.dst.tclassid |= tag & 0xFFFF;
+	if (!(rt->u.dst.tclassid & 0xFFFF0000))
+		rt->u.dst.tclassid |= tag & 0xFFFF0000;
+}
+#endif
+
+static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
+{
+	struct fib_info *fi = res->fi;
+
+	if (fi) {
+		if (FIB_RES_GW(*res) &&
+		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
+			rt->rt_gateway = FIB_RES_GW(*res);
+		memcpy(rt->u.dst.metrics, fi->fib_metrics,
+		       sizeof(rt->u.dst.metrics));
+		if (fi->fib_mtu == 0) {
+			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
+			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
+			    rt->rt_gateway != rt->rt_dst &&
+			    rt->u.dst.dev->mtu > 576)
+				rt->u.dst.metrics[RTAX_MTU-1] = 576;
+		}
+#ifdef CONFIG_NET_CLS_ROUTE
+		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
+#endif
+	} else
+		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
+
+	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
+		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
+	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
+		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
+	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
+		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
+				       ip_rt_min_advmss);
+	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
+		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
+
+#ifdef CONFIG_NET_CLS_ROUTE
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	set_class_tag(rt, fib_rules_tclass(res));
+#endif
+	set_class_tag(rt, itag);
+#endif
+        rt->rt_type = res->type;
+}
+
+static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr,
+				u8 tos, struct net_device *dev, int our)
+{
+	unsigned hash;
+	struct rtable *rth;
+	u32 spec_dst;
+	struct in_device *in_dev = in_dev_get(dev);
+	u32 itag = 0;
+
+	/* Primary sanity checks. */
+
+	if (in_dev == NULL)
+		return -EINVAL;
+
+	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
+	    skb->protocol != htons(ETH_P_IP))
+		goto e_inval;
+
+	if (ZERONET(saddr)) {
+		if (!LOCAL_MCAST(daddr))
+			goto e_inval;
+		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+	} else if (fib_validate_source(saddr, 0, tos, 0,
+					dev, &spec_dst, &itag) < 0)
+		goto e_inval;
+
+	rth = dst_alloc(&ipv4_dst_ops);
+	if (!rth)
+		goto e_nobufs;
+
+	rth->u.dst.output= ip_rt_bug;
+
+	atomic_set(&rth->u.dst.__refcnt, 1);
+	rth->u.dst.flags= DST_HOST;
+	if (in_dev->cnf.no_policy)
+		rth->u.dst.flags |= DST_NOPOLICY;
+	rth->fl.fl4_dst	= daddr;
+	rth->rt_dst	= daddr;
+	rth->fl.fl4_tos	= tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	rth->fl.fl4_fwmark= skb->nfmark;
+#endif
+	rth->fl.fl4_src	= saddr;
+	rth->rt_src	= saddr;
+#ifdef CONFIG_NET_CLS_ROUTE
+	rth->u.dst.tclassid = itag;
+#endif
+	rth->rt_iif	=
+	rth->fl.iif	= dev->ifindex;
+	rth->u.dst.dev	= &loopback_dev;
+	dev_hold(rth->u.dst.dev);
+	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->fl.oif	= 0;
+	rth->rt_gateway	= daddr;
+	rth->rt_spec_dst= spec_dst;
+	rth->rt_type	= RTN_MULTICAST;
+	rth->rt_flags	= RTCF_MULTICAST;
+	if (our) {
+		rth->u.dst.input= ip_local_deliver;
+		rth->rt_flags |= RTCF_LOCAL;
+	}
+
+#ifdef CONFIG_IP_MROUTE
+	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+		rth->u.dst.input = ip_mr_input;
+#endif
+	RT_CACHE_STAT_INC(in_slow_mc);
+
+	in_dev_put(in_dev);
+	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
+	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
+
+e_nobufs:
+	in_dev_put(in_dev);
+	return -ENOBUFS;
+
+e_inval:
+	in_dev_put(in_dev);
+	return -EINVAL;
+}
+
+
+static void ip_handle_martian_source(struct net_device *dev,
+				     struct in_device *in_dev,
+				     struct sk_buff *skb,
+				     u32 daddr,
+				     u32 saddr) 
+{
+	RT_CACHE_STAT_INC(in_martian_src);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
+		/*
+		 *	RFC1812 recommendation, if source is martian,
+		 *	the only hint is MAC header.
+		 */
+		printk(KERN_WARNING "martian source %u.%u.%u.%u from "
+			"%u.%u.%u.%u, on dev %s\n",
+			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+		if (dev->hard_header_len) {
+			int i;
+			unsigned char *p = skb->mac.raw;
+			printk(KERN_WARNING "ll header: ");
+			for (i = 0; i < dev->hard_header_len; i++, p++) {
+				printk("%02x", *p);
+				if (i < (dev->hard_header_len - 1))
+					printk(":");
+			}
+			printk("\n");
+		}
+	}
+#endif
+}
+
+static inline int __mkroute_input(struct sk_buff *skb, 
+				  struct fib_result* res, 
+				  struct in_device *in_dev, 
+				  u32 daddr, u32 saddr, u32 tos, 
+				  struct rtable **result) 
+{
+
+	struct rtable *rth;
+	int err;
+	struct in_device *out_dev;
+	unsigned flags = 0;
+	u32 spec_dst, itag;
+
+	/* get a working reference to the output device */
+	out_dev = in_dev_get(FIB_RES_DEV(*res));
+	if (out_dev == NULL) {
+		if (net_ratelimit())
+			printk(KERN_CRIT "Bug in ip_route_input" \
+			       "_slow(). Please, report\n");
+		return -EINVAL;
+	}
+
+
+	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 
+				  in_dev->dev, &spec_dst, &itag);
+	if (err < 0) {
+		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 
+					 saddr);
+		
+		err = -EINVAL;
+		goto cleanup;
+	}
+
+	if (err)
+		flags |= RTCF_DIRECTSRC;
+
+	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
+	    (IN_DEV_SHARED_MEDIA(out_dev) ||
+	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
+		flags |= RTCF_DOREDIRECT;
+
+	if (skb->protocol != htons(ETH_P_IP)) {
+		/* Not IP (i.e. ARP). Do not create route, if it is
+		 * invalid for proxy arp. DNAT routes are always valid.
+		 */
+		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
+			err = -EINVAL;
+			goto cleanup;
+		}
+	}
+
+
+	rth = dst_alloc(&ipv4_dst_ops);
+	if (!rth) {
+		err = -ENOBUFS;
+		goto cleanup;
+	}
+
+	rth->u.dst.flags= DST_HOST;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if (res->fi->fib_nhs > 1)
+		rth->u.dst.flags |= DST_BALANCED;
+#endif
+	if (in_dev->cnf.no_policy)
+		rth->u.dst.flags |= DST_NOPOLICY;
+	if (in_dev->cnf.no_xfrm)
+		rth->u.dst.flags |= DST_NOXFRM;
+	rth->fl.fl4_dst	= daddr;
+	rth->rt_dst	= daddr;
+	rth->fl.fl4_tos	= tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	rth->fl.fl4_fwmark= skb->nfmark;
+#endif
+	rth->fl.fl4_src	= saddr;
+	rth->rt_src	= saddr;
+	rth->rt_gateway	= daddr;
+	rth->rt_iif 	=
+		rth->fl.iif	= in_dev->dev->ifindex;
+	rth->u.dst.dev	= (out_dev)->dev;
+	dev_hold(rth->u.dst.dev);
+	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->fl.oif 	= 0;
+	rth->rt_spec_dst= spec_dst;
+
+	rth->u.dst.input = ip_forward;
+	rth->u.dst.output = ip_output;
+
+	rt_set_nexthop(rth, res, itag);
+
+	rth->rt_flags = flags;
+
+	*result = rth;
+	err = 0;
+ cleanup:
+	/* release the working reference to the output device */
+	in_dev_put(out_dev);
+	return err;
+}						
+
+static inline int ip_mkroute_input_def(struct sk_buff *skb, 
+				       struct fib_result* res, 
+				       const struct flowi *fl,
+				       struct in_device *in_dev,
+				       u32 daddr, u32 saddr, u32 tos)
+{
+	struct rtable* rth;
+	int err;
+	unsigned hash;
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
+		fib_select_multipath(fl, res);
+#endif
+
+	/* create a routing cache entry */
+	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
+	if (err)
+		return err;
+	atomic_set(&rth->u.dst.__refcnt, 1);
+
+	/* put it into the cache */
+	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
+	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);	
+}
+
+static inline int ip_mkroute_input(struct sk_buff *skb, 
+				   struct fib_result* res, 
+				   const struct flowi *fl,
+				   struct in_device *in_dev,
+				   u32 daddr, u32 saddr, u32 tos)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	struct rtable* rth;
+	unsigned char hop, hopcount, lasthop;
+	int err = -EINVAL;
+	unsigned int hash;
+
+	if (res->fi)
+		hopcount = res->fi->fib_nhs;
+	else
+		hopcount = 1;
+
+	lasthop = hopcount - 1;
+
+	/* distinguish between multipath and singlepath */
+	if (hopcount < 2)
+		return ip_mkroute_input_def(skb, res, fl, in_dev, daddr,
+					    saddr, tos);
+	
+	/* add all alternatives to the routing cache */
+	for (hop = 0; hop < hopcount; hop++) {
+		res->nh_sel = hop;
+
+		/* create a routing cache entry */
+		err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos,
+				      &rth);
+		if (err)
+			return err;
+
+		/* put it into the cache */
+		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
+		err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+		if (err)
+			return err;
+
+		/* forward hop information to multipath impl. */
+		multipath_set_nhinfo(rth,
+				     FIB_RES_NETWORK(*res),
+				     FIB_RES_NETMASK(*res),
+				     res->prefixlen,
+				     &FIB_RES_NH(*res));
+
+		/* only for the last hop the reference count is handled
+		 * outside
+		 */
+		if (hop == lasthop)
+			atomic_set(&(skb->dst->__refcnt), 1);
+	}
+	return err;
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
+	return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos);
+#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED  */
+}
+
+
+/*
+ *	NOTE. We drop all the packets that has local source
+ *	addresses, because every properly looped back packet
+ *	must have correct destination already attached by output routine.
+ *
+ *	Such approach solves two big problems:
+ *	1. Not simplex devices are handled properly.
+ *	2. IP spoofing attempts are filtered with 100% of guarantee.
+ */
+
+static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr,
+			       u8 tos, struct net_device *dev)
+{
+	struct fib_result res;
+	struct in_device *in_dev = in_dev_get(dev);
+	struct flowi fl = { .nl_u = { .ip4_u =
+				      { .daddr = daddr,
+					.saddr = saddr,
+					.tos = tos,
+					.scope = RT_SCOPE_UNIVERSE,
+#ifdef CONFIG_IP_ROUTE_FWMARK
+					.fwmark = skb->nfmark
+#endif
+				      } },
+			    .iif = dev->ifindex };
+	unsigned	flags = 0;
+	u32		itag = 0;
+	struct rtable * rth;
+	unsigned	hash;
+	u32		spec_dst;
+	int		err = -EINVAL;
+	int		free_res = 0;
+
+	/* IP on this device is disabled. */
+
+	if (!in_dev)
+		goto out;
+
+	/* Check for the most weird martians, which can be not detected
+	   by fib_lookup.
+	 */
+
+	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
+		goto martian_source;
+
+	if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0))
+		goto brd_input;
+
+	/* Accept zero addresses only to limited broadcast;
+	 * I even do not know to fix it or not. Waiting for complains :-)
+	 */
+	if (ZERONET(saddr))
+		goto martian_source;
+
+	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
+		goto martian_destination;
+
+	/*
+	 *	Now we are ready to route packet.
+	 */
+	if ((err = fib_lookup(&fl, &res)) != 0) {
+		if (!IN_DEV_FORWARD(in_dev))
+			goto e_inval;
+		goto no_route;
+	}
+	free_res = 1;
+
+	RT_CACHE_STAT_INC(in_slow_tot);
+
+	if (res.type == RTN_BROADCAST)
+		goto brd_input;
+
+	if (res.type == RTN_LOCAL) {
+		int result;
+		result = fib_validate_source(saddr, daddr, tos,
+					     loopback_dev.ifindex,
+					     dev, &spec_dst, &itag);
+		if (result < 0)
+			goto martian_source;
+		if (result)
+			flags |= RTCF_DIRECTSRC;
+		spec_dst = daddr;
+		goto local_input;
+	}
+
+	if (!IN_DEV_FORWARD(in_dev))
+		goto e_inval;
+	if (res.type != RTN_UNICAST)
+		goto martian_destination;
+
+	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
+	if (err == -ENOBUFS)
+		goto e_nobufs;
+	if (err == -EINVAL)
+		goto e_inval;
+	
+done:
+	in_dev_put(in_dev);
+	if (free_res)
+		fib_res_put(&res);
+out:	return err;
+
+brd_input:
+	if (skb->protocol != htons(ETH_P_IP))
+		goto e_inval;
+
+	if (ZERONET(saddr))
+		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
+	else {
+		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
+					  &itag);
+		if (err < 0)
+			goto martian_source;
+		if (err)
+			flags |= RTCF_DIRECTSRC;
+	}
+	flags |= RTCF_BROADCAST;
+	res.type = RTN_BROADCAST;
+	RT_CACHE_STAT_INC(in_brd);
+
+local_input:
+	rth = dst_alloc(&ipv4_dst_ops);
+	if (!rth)
+		goto e_nobufs;
+
+	rth->u.dst.output= ip_rt_bug;
+
+	atomic_set(&rth->u.dst.__refcnt, 1);
+	rth->u.dst.flags= DST_HOST;
+	if (in_dev->cnf.no_policy)
+		rth->u.dst.flags |= DST_NOPOLICY;
+	rth->fl.fl4_dst	= daddr;
+	rth->rt_dst	= daddr;
+	rth->fl.fl4_tos	= tos;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	rth->fl.fl4_fwmark= skb->nfmark;
+#endif
+	rth->fl.fl4_src	= saddr;
+	rth->rt_src	= saddr;
+#ifdef CONFIG_NET_CLS_ROUTE
+	rth->u.dst.tclassid = itag;
+#endif
+	rth->rt_iif	=
+	rth->fl.iif	= dev->ifindex;
+	rth->u.dst.dev	= &loopback_dev;
+	dev_hold(rth->u.dst.dev);
+	rth->idev	= in_dev_get(rth->u.dst.dev);
+	rth->rt_gateway	= daddr;
+	rth->rt_spec_dst= spec_dst;
+	rth->u.dst.input= ip_local_deliver;
+	rth->rt_flags 	= flags|RTCF_LOCAL;
+	if (res.type == RTN_UNREACHABLE) {
+		rth->u.dst.input= ip_error;
+		rth->u.dst.error= -err;
+		rth->rt_flags 	&= ~RTCF_LOCAL;
+	}
+	rth->rt_type	= res.type;
+	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
+	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+	goto done;
+
+no_route:
+	RT_CACHE_STAT_INC(in_no_route);
+	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
+	res.type = RTN_UNREACHABLE;
+	goto local_input;
+
+	/*
+	 *	Do not cache martian addresses: they should be logged (RFC1812)
+	 */
+martian_destination:
+	RT_CACHE_STAT_INC(in_martian_dst);
+#ifdef CONFIG_IP_ROUTE_VERBOSE
+	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
+		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
+			"%u.%u.%u.%u, dev %s\n",
+			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+#endif
+e_inval:
+	err = -EINVAL;
+	goto done;
+
+e_nobufs:
+	err = -ENOBUFS;
+	goto done;
+
+martian_source:
+	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
+	goto e_inval;
+}
+
+int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
+		   u8 tos, struct net_device *dev)
+{
+	struct rtable * rth;
+	unsigned	hash;
+	int iif = dev->ifindex;
+
+	tos &= IPTOS_RT_MASK;
+	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
+
+	rcu_read_lock();
+	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+	     rth = rcu_dereference(rth->u.rt_next)) {
+		if (rth->fl.fl4_dst == daddr &&
+		    rth->fl.fl4_src == saddr &&
+		    rth->fl.iif == iif &&
+		    rth->fl.oif == 0 &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		    rth->fl.fl4_fwmark == skb->nfmark &&
+#endif
+		    rth->fl.fl4_tos == tos) {
+			rth->u.dst.lastuse = jiffies;
+			dst_hold(&rth->u.dst);
+			rth->u.dst.__use++;
+			RT_CACHE_STAT_INC(in_hit);
+			rcu_read_unlock();
+			skb->dst = (struct dst_entry*)rth;
+			return 0;
+		}
+		RT_CACHE_STAT_INC(in_hlist_search);
+	}
+	rcu_read_unlock();
+
+	/* Multicast recognition logic is moved from route cache to here.
+	   The problem was that too many Ethernet cards have broken/missing
+	   hardware multicast filters :-( As result the host on multicasting
+	   network acquires a lot of useless route cache entries, sort of
+	   SDR messages from all the world. Now we try to get rid of them.
+	   Really, provided software IP multicast filter is organized
+	   reasonably (at least, hashed), it does not result in a slowdown
+	   comparing with route cache reject entries.
+	   Note, that multicast routers are not affected, because
+	   route cache entry is created eventually.
+	 */
+	if (MULTICAST(daddr)) {
+		struct in_device *in_dev;
+
+		rcu_read_lock();
+		if ((in_dev = __in_dev_get(dev)) != NULL) {
+			int our = ip_check_mc(in_dev, daddr, saddr,
+				skb->nh.iph->protocol);
+			if (our
+#ifdef CONFIG_IP_MROUTE
+			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+#endif
+			    ) {
+				rcu_read_unlock();
+				return ip_route_input_mc(skb, daddr, saddr,
+							 tos, dev, our);
+			}
+		}
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
+}
+
+static inline int __mkroute_output(struct rtable **result,
+				   struct fib_result* res, 
+				   const struct flowi *fl,
+				   const struct flowi *oldflp, 
+				   struct net_device *dev_out, 
+				   unsigned flags) 
+{
+	struct rtable *rth;
+	struct in_device *in_dev;
+	u32 tos = RT_FL_TOS(oldflp);
+	int err = 0;
+
+	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+		return -EINVAL;
+
+	if (fl->fl4_dst == 0xFFFFFFFF)
+		res->type = RTN_BROADCAST;
+	else if (MULTICAST(fl->fl4_dst))
+		res->type = RTN_MULTICAST;
+	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
+		return -EINVAL;
+
+	if (dev_out->flags & IFF_LOOPBACK)
+		flags |= RTCF_LOCAL;
+
+	/* get work reference to inet device */
+	in_dev = in_dev_get(dev_out);
+	if (!in_dev)
+		return -EINVAL;
+
+	if (res->type == RTN_BROADCAST) {
+		flags |= RTCF_BROADCAST | RTCF_LOCAL;
+		if (res->fi) {
+			fib_info_put(res->fi);
+			res->fi = NULL;
+		}
+	} else if (res->type == RTN_MULTICAST) {
+		flags |= RTCF_MULTICAST|RTCF_LOCAL;
+		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 
+				 oldflp->proto))
+			flags &= ~RTCF_LOCAL;
+		/* If multicast route do not exist use
+		   default one, but do not gateway in this case.
+		   Yes, it is hack.
+		 */
+		if (res->fi && res->prefixlen < 4) {
+			fib_info_put(res->fi);
+			res->fi = NULL;
+		}
+	}
+
+
+	rth = dst_alloc(&ipv4_dst_ops);
+	if (!rth) {
+		err = -ENOBUFS;
+		goto cleanup;
+	}		
+
+	rth->u.dst.flags= DST_HOST;
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if (res->fi) {
+		rth->rt_multipath_alg = res->fi->fib_mp_alg;
+		if (res->fi->fib_nhs > 1)
+			rth->u.dst.flags |= DST_BALANCED;
+	}
+#endif
+	if (in_dev->cnf.no_xfrm)
+		rth->u.dst.flags |= DST_NOXFRM;
+	if (in_dev->cnf.no_policy)
+		rth->u.dst.flags |= DST_NOPOLICY;
+
+	rth->fl.fl4_dst	= oldflp->fl4_dst;
+	rth->fl.fl4_tos	= tos;
+	rth->fl.fl4_src	= oldflp->fl4_src;
+	rth->fl.oif	= oldflp->oif;
+#ifdef CONFIG_IP_ROUTE_FWMARK
+	rth->fl.fl4_fwmark= oldflp->fl4_fwmark;
+#endif
+	rth->rt_dst	= fl->fl4_dst;
+	rth->rt_src	= fl->fl4_src;
+	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
+	/* get references to the devices that are to be hold by the routing 
+	   cache entry */
+	rth->u.dst.dev	= dev_out;
+	dev_hold(dev_out);
+	rth->idev	= in_dev_get(dev_out);
+	rth->rt_gateway = fl->fl4_dst;
+	rth->rt_spec_dst= fl->fl4_src;
+
+	rth->u.dst.output=ip_output;
+
+	RT_CACHE_STAT_INC(out_slow_tot);
+
+	if (flags & RTCF_LOCAL) {
+		rth->u.dst.input = ip_local_deliver;
+		rth->rt_spec_dst = fl->fl4_dst;
+	}
+	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
+		rth->rt_spec_dst = fl->fl4_src;
+		if (flags & RTCF_LOCAL && 
+		    !(dev_out->flags & IFF_LOOPBACK)) {
+			rth->u.dst.output = ip_mc_output;
+			RT_CACHE_STAT_INC(out_slow_mc);
+		}
+#ifdef CONFIG_IP_MROUTE
+		if (res->type == RTN_MULTICAST) {
+			if (IN_DEV_MFORWARD(in_dev) &&
+			    !LOCAL_MCAST(oldflp->fl4_dst)) {
+				rth->u.dst.input = ip_mr_input;
+				rth->u.dst.output = ip_mc_output;
+			}
+		}
+#endif
+	}
+
+	rt_set_nexthop(rth, res, 0);
+
+	rth->rt_flags = flags;
+
+	*result = rth;
+ cleanup:
+	/* release work reference to inet device */
+	in_dev_put(in_dev);
+
+	return err;
+}
+
+static inline int ip_mkroute_output_def(struct rtable **rp,
+					struct fib_result* res,
+					const struct flowi *fl,
+					const struct flowi *oldflp,
+					struct net_device *dev_out,
+					unsigned flags)
+{
+	struct rtable *rth;
+	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
+	unsigned hash;
+	if (err == 0) {
+		u32 tos = RT_FL_TOS(oldflp);
+
+		atomic_set(&rth->u.dst.__refcnt, 1);
+		
+		hash = rt_hash_code(oldflp->fl4_dst, 
+				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
+		err = rt_intern_hash(hash, rth, rp);
+	}
+	
+	return err;
+}
+
+static inline int ip_mkroute_output(struct rtable** rp,
+				    struct fib_result* res,
+				    const struct flowi *fl,
+				    const struct flowi *oldflp,
+				    struct net_device *dev_out,
+				    unsigned flags)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	u32 tos = RT_FL_TOS(oldflp);
+	unsigned char hop;
+	unsigned hash;
+	int err = -EINVAL;
+	struct rtable *rth;
+
+	if (res->fi && res->fi->fib_nhs > 1) {
+		unsigned char hopcount = res->fi->fib_nhs;
+
+		for (hop = 0; hop < hopcount; hop++) {
+			struct net_device *dev2nexthop;
+
+			res->nh_sel = hop;
+
+			/* hold a work reference to the output device */
+			dev2nexthop = FIB_RES_DEV(*res);
+			dev_hold(dev2nexthop);
+
+			err = __mkroute_output(&rth, res, fl, oldflp,
+					       dev2nexthop, flags);
+
+			if (err != 0)
+				goto cleanup;
+
+			hash = rt_hash_code(oldflp->fl4_dst, 
+					    oldflp->fl4_src ^
+					    (oldflp->oif << 5), tos);
+			err = rt_intern_hash(hash, rth, rp);
+
+			/* forward hop information to multipath impl. */
+			multipath_set_nhinfo(rth,
+					     FIB_RES_NETWORK(*res),
+					     FIB_RES_NETMASK(*res),
+					     res->prefixlen,
+					     &FIB_RES_NH(*res));
+		cleanup:
+			/* release work reference to output device */
+			dev_put(dev2nexthop);
+
+			if (err != 0)
+				return err;
+		}
+		atomic_set(&(*rp)->u.dst.__refcnt, 1);
+		return err;
+	} else {
+		return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out,
+					     flags);
+	}
+#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */
+	return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags);
+#endif
+}
+
+/*
+ * Major route resolver routine.
+ */
+
+static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
+{
+	u32 tos	= RT_FL_TOS(oldflp);
+	struct flowi fl = { .nl_u = { .ip4_u =
+				      { .daddr = oldflp->fl4_dst,
+					.saddr = oldflp->fl4_src,
+					.tos = tos & IPTOS_RT_MASK,
+					.scope = ((tos & RTO_ONLINK) ?
+						  RT_SCOPE_LINK :
+						  RT_SCOPE_UNIVERSE),
+#ifdef CONFIG_IP_ROUTE_FWMARK
+					.fwmark = oldflp->fl4_fwmark
+#endif
+				      } },
+			    .iif = loopback_dev.ifindex,
+			    .oif = oldflp->oif };
+	struct fib_result res;
+	unsigned flags = 0;
+	struct net_device *dev_out = NULL;
+	int free_res = 0;
+	int err;
+
+
+	res.fi		= NULL;
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	res.r		= NULL;
+#endif
+
+	if (oldflp->fl4_src) {
+		err = -EINVAL;
+		if (MULTICAST(oldflp->fl4_src) ||
+		    BADCLASS(oldflp->fl4_src) ||
+		    ZERONET(oldflp->fl4_src))
+			goto out;
+
+		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+		dev_out = ip_dev_find(oldflp->fl4_src);
+		if (dev_out == NULL)
+			goto out;
+
+		/* I removed check for oif == dev_out->oif here.
+		   It was wrong for two reasons:
+		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
+		      assigned to multiple interfaces.
+		   2. Moreover, we are allowed to send packets with saddr
+		      of another iface. --ANK
+		 */
+
+		if (oldflp->oif == 0
+		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) {
+			/* Special hack: user can direct multicasts
+			   and limited broadcast via necessary interface
+			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
+			   This hack is not just for fun, it allows
+			   vic,vat and friends to work.
+			   They bind socket to loopback, set ttl to zero
+			   and expect that it will work.
+			   From the viewpoint of routing cache they are broken,
+			   because we are not allowed to build multicast path
+			   with loopback source addr (look, routing cache
+			   cannot know, that ttl is zero, so that packet
+			   will not leave this host and route is valid).
+			   Luckily, this hack is good workaround.
+			 */
+
+			fl.oif = dev_out->ifindex;
+			goto make_route;
+		}
+		if (dev_out)
+			dev_put(dev_out);
+		dev_out = NULL;
+	}
+
+
+	if (oldflp->oif) {
+		dev_out = dev_get_by_index(oldflp->oif);
+		err = -ENODEV;
+		if (dev_out == NULL)
+			goto out;
+		if (__in_dev_get(dev_out) == NULL) {
+			dev_put(dev_out);
+			goto out;	/* Wrong error code */
+		}
+
+		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) {
+			if (!fl.fl4_src)
+				fl.fl4_src = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_LINK);
+			goto make_route;
+		}
+		if (!fl.fl4_src) {
+			if (MULTICAST(oldflp->fl4_dst))
+				fl.fl4_src = inet_select_addr(dev_out, 0,
+							      fl.fl4_scope);
+			else if (!oldflp->fl4_dst)
+				fl.fl4_src = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_HOST);
+		}
+	}
+
+	if (!fl.fl4_dst) {
+		fl.fl4_dst = fl.fl4_src;
+		if (!fl.fl4_dst)
+			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
+		if (dev_out)
+			dev_put(dev_out);
+		dev_out = &loopback_dev;
+		dev_hold(dev_out);
+		fl.oif = loopback_dev.ifindex;
+		res.type = RTN_LOCAL;
+		flags |= RTCF_LOCAL;
+		goto make_route;
+	}
+
+	if (fib_lookup(&fl, &res)) {
+		res.fi = NULL;
+		if (oldflp->oif) {
+			/* Apparently, routing tables are wrong. Assume,
+			   that the destination is on link.
+
+			   WHY? DW.
+			   Because we are allowed to send to iface
+			   even if it has NO routes and NO assigned
+			   addresses. When oif is specified, routing
+			   tables are looked up with only one purpose:
+			   to catch if destination is gatewayed, rather than
+			   direct. Moreover, if MSG_DONTROUTE is set,
+			   we send packet, ignoring both routing tables
+			   and ifaddr state. --ANK
+
+
+			   We could make it even if oif is unknown,
+			   likely IPv6, but we do not.
+			 */
+
+			if (fl.fl4_src == 0)
+				fl.fl4_src = inet_select_addr(dev_out, 0,
+							      RT_SCOPE_LINK);
+			res.type = RTN_UNICAST;
+			goto make_route;
+		}
+		if (dev_out)
+			dev_put(dev_out);
+		err = -ENETUNREACH;
+		goto out;
+	}
+	free_res = 1;
+
+	if (res.type == RTN_LOCAL) {
+		if (!fl.fl4_src)
+			fl.fl4_src = fl.fl4_dst;
+		if (dev_out)
+			dev_put(dev_out);
+		dev_out = &loopback_dev;
+		dev_hold(dev_out);
+		fl.oif = dev_out->ifindex;
+		if (res.fi)
+			fib_info_put(res.fi);
+		res.fi = NULL;
+		flags |= RTCF_LOCAL;
+		goto make_route;
+	}
+
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+	if (res.fi->fib_nhs > 1 && fl.oif == 0)
+		fib_select_multipath(&fl, &res);
+	else
+#endif
+	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
+		fib_select_default(&fl, &res);
+
+	if (!fl.fl4_src)
+		fl.fl4_src = FIB_RES_PREFSRC(res);
+
+	if (dev_out)
+		dev_put(dev_out);
+	dev_out = FIB_RES_DEV(res);
+	dev_hold(dev_out);
+	fl.oif = dev_out->ifindex;
+
+
+make_route:
+	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
+
+
+	if (free_res)
+		fib_res_put(&res);
+	if (dev_out)
+		dev_put(dev_out);
+out:	return err;
+}
+
+int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
+{
+	unsigned hash;
+	struct rtable *rth;
+
+	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
+
+	rcu_read_lock_bh();
+	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+		rth = rcu_dereference(rth->u.rt_next)) {
+		if (rth->fl.fl4_dst == flp->fl4_dst &&
+		    rth->fl.fl4_src == flp->fl4_src &&
+		    rth->fl.iif == 0 &&
+		    rth->fl.oif == flp->oif &&
+#ifdef CONFIG_IP_ROUTE_FWMARK
+		    rth->fl.fl4_fwmark == flp->fl4_fwmark &&
+#endif
+		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
+			    (IPTOS_RT_MASK | RTO_ONLINK))) {
+
+			/* check for multipath routes and choose one if
+			 * necessary
+			 */
+			if (multipath_select_route(flp, rth, rp)) {
+				dst_hold(&(*rp)->u.dst);
+				RT_CACHE_STAT_INC(out_hit);
+				rcu_read_unlock_bh();
+				return 0;
+			}
+
+			rth->u.dst.lastuse = jiffies;
+			dst_hold(&rth->u.dst);
+			rth->u.dst.__use++;
+			RT_CACHE_STAT_INC(out_hit);
+			rcu_read_unlock_bh();
+			*rp = rth;
+			return 0;
+		}
+		RT_CACHE_STAT_INC(out_hlist_search);
+	}
+	rcu_read_unlock_bh();
+
+	return ip_route_output_slow(rp, flp);
+}
+
+int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
+{
+	int err;
+
+	if ((err = __ip_route_output_key(rp, flp)) != 0)
+		return err;
+
+	if (flp->proto) {
+		if (!flp->fl4_src)
+			flp->fl4_src = (*rp)->rt_src;
+		if (!flp->fl4_dst)
+			flp->fl4_dst = (*rp)->rt_dst;
+		return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
+	}
+
+	return 0;
+}
+
+int ip_route_output_key(struct rtable **rp, struct flowi *flp)
+{
+	return ip_route_output_flow(rp, flp, NULL, 0);
+}
+
+static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+			int nowait)
+{
+	struct rtable *rt = (struct rtable*)skb->dst;
+	struct rtmsg *r;
+	struct nlmsghdr  *nlh;
+	unsigned char	 *b = skb->tail;
+	struct rta_cacheinfo ci;
+#ifdef CONFIG_IP_MROUTE
+	struct rtattr *eptr;
+#endif
+	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r));
+	r = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0;
+	r->rtm_family	 = AF_INET;
+	r->rtm_dst_len	= 32;
+	r->rtm_src_len	= 0;
+	r->rtm_tos	= rt->fl.fl4_tos;
+	r->rtm_table	= RT_TABLE_MAIN;
+	r->rtm_type	= rt->rt_type;
+	r->rtm_scope	= RT_SCOPE_UNIVERSE;
+	r->rtm_protocol = RTPROT_UNSPEC;
+	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
+	if (rt->rt_flags & RTCF_NOTIFY)
+		r->rtm_flags |= RTM_F_NOTIFY;
+	RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst);
+	if (rt->fl.fl4_src) {
+		r->rtm_src_len = 32;
+		RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src);
+	}
+	if (rt->u.dst.dev)
+		RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex);
+#ifdef CONFIG_NET_CLS_ROUTE
+	if (rt->u.dst.tclassid)
+		RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid);
+#endif
+#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
+	if (rt->rt_multipath_alg != IP_MP_ALG_NONE) {
+		__u32 alg = rt->rt_multipath_alg;
+
+		RTA_PUT(skb, RTA_MP_ALGO, 4, &alg);
+	}
+#endif
+	if (rt->fl.iif)
+		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst);
+	else if (rt->rt_src != rt->fl.fl4_src)
+		RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src);
+	if (rt->rt_dst != rt->rt_gateway)
+		RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway);
+	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
+		goto rtattr_failure;
+	ci.rta_lastuse	= jiffies_to_clock_t(jiffies - rt->u.dst.lastuse);
+	ci.rta_used	= rt->u.dst.__use;
+	ci.rta_clntref	= atomic_read(&rt->u.dst.__refcnt);
+	if (rt->u.dst.expires)
+		ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies);
+	else
+		ci.rta_expires = 0;
+	ci.rta_error	= rt->u.dst.error;
+	ci.rta_id	= ci.rta_ts = ci.rta_tsage = 0;
+	if (rt->peer) {
+		ci.rta_id = rt->peer->ip_id_count;
+		if (rt->peer->tcp_ts_stamp) {
+			ci.rta_ts = rt->peer->tcp_ts;
+			ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp;
+		}
+	}
+#ifdef CONFIG_IP_MROUTE
+	eptr = (struct rtattr*)skb->tail;
+#endif
+	RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci);
+	if (rt->fl.iif) {
+#ifdef CONFIG_IP_MROUTE
+		u32 dst = rt->rt_dst;
+
+		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
+		    ipv4_devconf.mc_forwarding) {
+			int err = ipmr_get_route(skb, r, nowait);
+			if (err <= 0) {
+				if (!nowait) {
+					if (err == 0)
+						return 0;
+					goto nlmsg_failure;
+				} else {
+					if (err == -EMSGSIZE)
+						goto nlmsg_failure;
+					((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err;
+				}
+			}
+		} else
+#endif
+			RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif);
+	}
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
+{
+	struct rtattr **rta = arg;
+	struct rtmsg *rtm = NLMSG_DATA(nlh);
+	struct rtable *rt = NULL;
+	u32 dst = 0;
+	u32 src = 0;
+	int iif = 0;
+	int err = -ENOBUFS;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		goto out;
+
+	/* Reserve room for dummy headers, this skb can pass
+	   through good chunk of routing engine.
+	 */
+	skb->mac.raw = skb->data;
+	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
+
+	if (rta[RTA_SRC - 1])
+		memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4);
+	if (rta[RTA_DST - 1])
+		memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4);
+	if (rta[RTA_IIF - 1])
+		memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int));
+
+	if (iif) {
+		struct net_device *dev = __dev_get_by_index(iif);
+		err = -ENODEV;
+		if (!dev)
+			goto out_free;
+		skb->protocol	= htons(ETH_P_IP);
+		skb->dev	= dev;
+		local_bh_disable();
+		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
+		local_bh_enable();
+		rt = (struct rtable*)skb->dst;
+		if (!err && rt->u.dst.error)
+			err = -rt->u.dst.error;
+	} else {
+		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
+							 .saddr = src,
+							 .tos = rtm->rtm_tos } } };
+		int oif = 0;
+		if (rta[RTA_OIF - 1])
+			memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int));
+		fl.oif = oif;
+		err = ip_route_output_key(&rt, &fl);
+	}
+	if (err)
+		goto out_free;
+
+	skb->dst = &rt->u.dst;
+	if (rtm->rtm_flags & RTM_F_NOTIFY)
+		rt->rt_flags |= RTCF_NOTIFY;
+
+	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
+
+	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+				RTM_NEWROUTE, 0);
+	if (!err)
+		goto out_free;
+	if (err < 0) {
+		err = -EMSGSIZE;
+		goto out_free;
+	}
+
+	err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+out:	return err;
+
+out_free:
+	kfree_skb(skb);
+	goto out;
+}
+
+int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
+{
+	struct rtable *rt;
+	int h, s_h;
+	int idx, s_idx;
+
+	s_h = cb->args[0];
+	s_idx = idx = cb->args[1];
+	for (h = 0; h <= rt_hash_mask; h++) {
+		if (h < s_h) continue;
+		if (h > s_h)
+			s_idx = 0;
+		rcu_read_lock_bh();
+		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
+		     rt = rcu_dereference(rt->u.rt_next), idx++) {
+			if (idx < s_idx)
+				continue;
+			skb->dst = dst_clone(&rt->u.dst);
+			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+					 cb->nlh->nlmsg_seq,
+					 RTM_NEWROUTE, 1) <= 0) {
+				dst_release(xchg(&skb->dst, NULL));
+				rcu_read_unlock_bh();
+				goto done;
+			}
+			dst_release(xchg(&skb->dst, NULL));
+		}
+		rcu_read_unlock_bh();
+	}
+
+done:
+	cb->args[0] = h;
+	cb->args[1] = idx;
+	return skb->len;
+}
+
+void ip_rt_multicast_event(struct in_device *in_dev)
+{
+	rt_cache_flush(0);
+}
+
+#ifdef CONFIG_SYSCTL
+static int flush_delay;
+
+static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
+					struct file *filp, void __user *buffer,
+					size_t *lenp, loff_t *ppos)
+{
+	if (write) {
+		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+		rt_cache_flush(flush_delay);
+		return 0;
+	} 
+
+	return -EINVAL;
+}
+
+static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
+						int __user *name,
+						int nlen,
+						void __user *oldval,
+						size_t __user *oldlenp,
+						void __user *newval,
+						size_t newlen,
+						void **context)
+{
+	int delay;
+	if (newlen != sizeof(int))
+		return -EINVAL;
+	if (get_user(delay, (int __user *)newval))
+		return -EFAULT; 
+	rt_cache_flush(delay); 
+	return 0;
+}
+
+ctl_table ipv4_route_table[] = {
+        {
+		.ctl_name 	= NET_IPV4_ROUTE_FLUSH,
+		.procname	= "flush",
+		.data		= &flush_delay,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &ipv4_sysctl_rtcache_flush,
+		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
+		.procname	= "min_delay",
+		.data		= &ip_rt_min_delay,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
+		.procname	= "max_delay",
+		.data		= &ip_rt_max_delay,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
+		.procname	= "gc_thresh",
+		.data		= &ipv4_dst_ops.gc_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
+		.procname	= "max_size",
+		.data		= &ip_rt_max_size,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		/*  Deprecated. Use gc_min_interval_ms */
+ 
+		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL,
+		.procname	= "gc_min_interval",
+		.data		= &ip_rt_gc_min_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
+		.procname	= "gc_min_interval_ms",
+		.data		= &ip_rt_gc_min_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_ms_jiffies,
+		.strategy	= &sysctl_ms_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
+		.procname	= "gc_timeout",
+		.data		= &ip_rt_gc_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
+		.procname	= "gc_interval",
+		.data		= &ip_rt_gc_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
+		.procname	= "redirect_load",
+		.data		= &ip_rt_redirect_load,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
+		.procname	= "redirect_number",
+		.data		= &ip_rt_redirect_number,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
+		.procname	= "redirect_silence",
+		.data		= &ip_rt_redirect_silence,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
+		.procname	= "error_cost",
+		.data		= &ip_rt_error_cost,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
+		.procname	= "error_burst",
+		.data		= &ip_rt_error_burst,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
+		.procname	= "gc_elasticity",
+		.data		= &ip_rt_gc_elasticity,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
+		.procname	= "mtu_expires",
+		.data		= &ip_rt_mtu_expires,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
+		.procname	= "min_pmtu",
+		.data		= &ip_rt_min_pmtu,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
+		.procname	= "min_adv_mss",
+		.data		= &ip_rt_min_advmss,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
+		.procname	= "secret_interval",
+		.data		= &ip_rt_secret_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
+	},
+	{ .ctl_name = 0 }
+};
+#endif
+
+#ifdef CONFIG_NET_CLS_ROUTE
+struct ip_rt_acct *ip_rt_acct;
+
+/* This code sucks.  But you should have seen it before! --RR */
+
+/* IP route accounting ptr for this logical cpu number. */
+#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
+
+#ifdef CONFIG_PROC_FS
+static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
+			   int length, int *eof, void *data)
+{
+	unsigned int i;
+
+	if ((offset & 3) || (length & 3))
+		return -EIO;
+
+	if (offset >= sizeof(struct ip_rt_acct) * 256) {
+		*eof = 1;
+		return 0;
+	}
+
+	if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
+		length = sizeof(struct ip_rt_acct) * 256 - offset;
+		*eof = 1;
+	}
+
+	offset /= sizeof(u32);
+
+	if (length > 0) {
+		u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset;
+		u32 *dst = (u32 *) buffer;
+
+		/* Copy first cpu. */
+		*start = buffer;
+		memcpy(dst, src, length);
+
+		/* Add the other cpus in, one int at a time */
+		for_each_cpu(i) {
+			unsigned int j;
+
+			src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset;
+
+			for (j = 0; j < length/4; j++)
+				dst[j] += src[j];
+		}
+	}
+	return length;
+}
+#endif /* CONFIG_PROC_FS */
+#endif /* CONFIG_NET_CLS_ROUTE */
+
+static __initdata unsigned long rhash_entries;
+static int __init set_rhash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	rhash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("rhash_entries=", set_rhash_entries);
+
+int __init ip_rt_init(void)
+{
+	int i, order, goal, rc = 0;
+
+	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
+			     (jiffies ^ (jiffies >> 7)));
+
+#ifdef CONFIG_NET_CLS_ROUTE
+	for (order = 0;
+	     (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++)
+		/* NOTHING */;
+	ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order);
+	if (!ip_rt_acct)
+		panic("IP: failed to allocate ip_rt_acct\n");
+	memset(ip_rt_acct, 0, PAGE_SIZE << order);
+#endif
+
+	ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache",
+						     sizeof(struct rtable),
+						     0, SLAB_HWCACHE_ALIGN,
+						     NULL, NULL);
+
+	if (!ipv4_dst_ops.kmem_cachep)
+		panic("IP: failed to allocate ip_dst_cache\n");
+
+	goal = num_physpages >> (26 - PAGE_SHIFT);
+	if (rhash_entries)
+		goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT;
+	for (order = 0; (1UL << order) < goal; order++)
+		/* NOTHING */;
+
+	do {
+		rt_hash_mask = (1UL << order) * PAGE_SIZE /
+			sizeof(struct rt_hash_bucket);
+		while (rt_hash_mask & (rt_hash_mask - 1))
+			rt_hash_mask--;
+		rt_hash_table = (struct rt_hash_bucket *)
+			__get_free_pages(GFP_ATOMIC, order);
+	} while (rt_hash_table == NULL && --order > 0);
+
+	if (!rt_hash_table)
+		panic("Failed to allocate IP route cache hash table\n");
+
+	printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n",
+	       rt_hash_mask,
+	       (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024);
+
+	for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++)
+		/* NOTHING */;
+
+	rt_hash_mask--;
+	for (i = 0; i <= rt_hash_mask; i++) {
+		spin_lock_init(&rt_hash_table[i].lock);
+		rt_hash_table[i].chain = NULL;
+	}
+
+	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
+	ip_rt_max_size = (rt_hash_mask + 1) * 16;
+
+	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
+	if (!rt_cache_stat)
+		return -ENOMEM;
+
+	devinet_init();
+	ip_fib_init();
+
+	init_timer(&rt_flush_timer);
+	rt_flush_timer.function = rt_run_flush;
+	init_timer(&rt_periodic_timer);
+	rt_periodic_timer.function = rt_check_expire;
+	init_timer(&rt_secret_timer);
+	rt_secret_timer.function = rt_secret_rebuild;
+
+	/* All the timers, started at system startup tend
+	   to synchronize. Perturb it a bit.
+	 */
+	rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval +
+					ip_rt_gc_interval;
+	add_timer(&rt_periodic_timer);
+
+	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
+		ip_rt_secret_interval;
+	add_timer(&rt_secret_timer);
+
+#ifdef CONFIG_PROC_FS
+	{
+	struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
+	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
+	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
+			    		     proc_net_stat))) {
+		free_percpu(rt_cache_stat);
+		return -ENOMEM;
+	}
+	rtstat_pde->proc_fops = &rt_cpu_seq_fops;
+	}
+#ifdef CONFIG_NET_CLS_ROUTE
+	create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL);
+#endif
+#endif
+#ifdef CONFIG_XFRM
+	xfrm_init();
+	xfrm4_init();
+#endif
+	return rc;
+}
+
+EXPORT_SYMBOL(__ip_select_ident);
+EXPORT_SYMBOL(ip_route_input);
+EXPORT_SYMBOL(ip_route_output_key);
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
new file mode 100644
index 000000000000..e923d2f021aa
--- /dev/null
+++ b/net/ipv4/syncookies.c
@@ -0,0 +1,279 @@
+/*
+ *  Syncookies implementation for the Linux kernel
+ *
+ *  Copyright (C) 1997 Andi Kleen
+ *  Based on ideas by D.J.Bernstein and Eric Schenk. 
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ * 
+ *  $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $
+ *
+ *  Missing: IPv6 support. 
+ */
+
+#include <linux/tcp.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/cryptohash.h>
+#include <linux/kernel.h>
+#include <net/tcp.h>
+
+extern int sysctl_tcp_syncookies;
+
+static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS];
+
+static __init int init_syncookies(void)
+{
+	get_random_bytes(syncookie_secret, sizeof(syncookie_secret));
+	return 0;
+}
+module_init(init_syncookies);
+
+#define COOKIEBITS 24	/* Upper bits store count */
+#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
+
+static u32 cookie_hash(u32 saddr, u32 daddr, u32 sport, u32 dport,
+		       u32 count, int c)
+{
+	__u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS];
+
+	memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c]));
+	tmp[0] = saddr;
+	tmp[1] = daddr;
+	tmp[2] = (sport << 16) + dport;
+	tmp[3] = count;
+	sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5);
+
+	return tmp[17];
+}
+
+static __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport,
+				   __u16 dport, __u32 sseq, __u32 count,
+				   __u32 data)
+{
+	/*
+	 * Compute the secure sequence number.
+	 * The output should be:
+   	 *   HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24)
+	 *      + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24).
+	 * Where sseq is their sequence number and count increases every
+	 * minute by 1.
+	 * As an extra hack, we add a small "data" value that encodes the
+	 * MSS into the second hash value.
+	 */
+
+	return (cookie_hash(saddr, daddr, sport, dport, 0, 0) +
+		sseq + (count << COOKIEBITS) +
+		((cookie_hash(saddr, daddr, sport, dport, count, 1) + data)
+		 & COOKIEMASK));
+}
+
+/*
+ * This retrieves the small "data" value from the syncookie.
+ * If the syncookie is bad, the data returned will be out of
+ * range.  This must be checked by the caller.
+ *
+ * The count value used to generate the cookie must be within
+ * "maxdiff" if the current (passed-in) "count".  The return value
+ * is (__u32)-1 if this test fails.
+ */
+static __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr,
+				  __u16 sport, __u16 dport, __u32 sseq,
+				  __u32 count, __u32 maxdiff)
+{
+	__u32 diff;
+
+	/* Strip away the layers from the cookie */
+	cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq;
+
+	/* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */
+	diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS);
+	if (diff >= maxdiff)
+		return (__u32)-1;
+
+	return (cookie -
+		cookie_hash(saddr, daddr, sport, dport, count - diff, 1))
+		& COOKIEMASK;	/* Leaving the data behind */
+}
+
+/* 
+ * This table has to be sorted and terminated with (__u16)-1.
+ * XXX generate a better table.
+ * Unresolved Issues: HIPPI with a 64k MSS is not well supported.
+ */
+static __u16 const msstab[] = {
+	64 - 1,
+	256 - 1,	
+	512 - 1,
+	536 - 1,
+	1024 - 1,	
+	1440 - 1,
+	1460 - 1,
+	4312 - 1,
+	(__u16)-1
+};
+/* The number doesn't include the -1 terminator */
+#define NUM_MSS (ARRAY_SIZE(msstab) - 1)
+
+/*
+ * Generate a syncookie.  mssp points to the mss, which is returned
+ * rounded down to the value encoded in the cookie.
+ */
+__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mssind;
+	const __u16 mss = *mssp;
+
+	
+	tp->last_synq_overflow = jiffies;
+
+	/* XXX sort msstab[] by probability?  Binary search? */
+	for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
+		;
+	*mssp = msstab[mssind] + 1;
+
+	NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT);
+
+	return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr,
+				     skb->h.th->source, skb->h.th->dest,
+				     ntohl(skb->h.th->seq),
+				     jiffies / (HZ * 60), mssind);
+}
+
+/* 
+ * This (misnamed) value is the age of syncookie which is permitted.
+ * Its ideal value should be dependent on TCP_TIMEOUT_INIT and
+ * sysctl_tcp_retries1. It's a rather complicated formula (exponential
+ * backoff) to compute at runtime so it's currently hardcoded here.
+ */
+#define COUNTER_TRIES 4
+/*  
+ * Check if a ack sequence number is a valid syncookie. 
+ * Return the decoded mss if it is, or 0 if not.
+ */
+static inline int cookie_check(struct sk_buff *skb, __u32 cookie)
+{
+	__u32 seq; 
+	__u32 mssind;
+
+	seq = ntohl(skb->h.th->seq)-1; 
+	mssind = check_tcp_syn_cookie(cookie,
+				      skb->nh.iph->saddr, skb->nh.iph->daddr,
+				      skb->h.th->source, skb->h.th->dest,
+				      seq, jiffies / (HZ * 60), COUNTER_TRIES);
+
+	return mssind < NUM_MSS ? msstab[mssind] + 1 : 0;
+}
+
+extern struct or_calltable or_ipv4;
+
+static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
+					   struct open_request *req,
+					   struct dst_entry *dst)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sock *child;
+
+	child = tp->af_specific->syn_recv_sock(sk, skb, req, dst);
+	if (child)
+		tcp_acceptq_queue(sk, req, child);
+	else
+		tcp_openreq_free(req);
+
+	return child;
+}
+
+struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
+			     struct ip_options *opt)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 cookie = ntohl(skb->h.th->ack_seq) - 1; 
+	struct sock *ret = sk;
+	struct open_request *req; 
+	int mss; 
+	struct rtable *rt; 
+	__u8 rcv_wscale;
+
+	if (!sysctl_tcp_syncookies || !skb->h.th->ack)
+		goto out;
+
+  	if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) ||
+	    (mss = cookie_check(skb, cookie)) == 0) {
+	 	NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED);
+		goto out;
+	}
+
+	NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV);
+
+	req = tcp_openreq_alloc();
+	ret = NULL;
+	if (!req)
+		goto out;
+
+	req->rcv_isn		= htonl(skb->h.th->seq) - 1;
+	req->snt_isn		= cookie; 
+	req->mss		= mss;
+ 	req->rmt_port		= skb->h.th->source;
+	req->af.v4_req.loc_addr = skb->nh.iph->daddr;
+	req->af.v4_req.rmt_addr = skb->nh.iph->saddr;
+	req->class		= &or_ipv4; /* for savety */
+	req->af.v4_req.opt	= NULL;
+
+	/* We throwed the options of the initial SYN away, so we hope
+	 * the ACK carries the same options again (see RFC1122 4.2.3.8)
+	 */
+	if (opt && opt->optlen) {
+		int opt_size = sizeof(struct ip_options) + opt->optlen;
+
+		req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC);
+		if (req->af.v4_req.opt) {
+			if (ip_options_echo(req->af.v4_req.opt, skb)) {
+				kfree(req->af.v4_req.opt);
+				req->af.v4_req.opt = NULL;
+			}
+		}
+	}
+
+	req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0;
+	req->wscale_ok	= req->sack_ok = 0; 
+	req->expires	= 0UL; 
+	req->retrans	= 0; 
+	
+	/*
+	 * We need to lookup the route here to get at the correct
+	 * window size. We should better make sure that the window size
+	 * hasn't changed since we received the original syn, but I see
+	 * no easy way to do this. 
+	 */
+	{
+		struct flowi fl = { .nl_u = { .ip4_u =
+					      { .daddr = ((opt && opt->srr) ?
+							  opt->faddr :
+							  req->af.v4_req.rmt_addr),
+						.saddr = req->af.v4_req.loc_addr,
+						.tos = RT_CONN_FLAGS(sk) } },
+				    .proto = IPPROTO_TCP,
+				    .uli_u = { .ports =
+					       { .sport = skb->h.th->dest,
+						 .dport = skb->h.th->source } } };
+		if (ip_route_output_key(&rt, &fl)) {
+			tcp_openreq_free(req);
+			goto out; 
+		}
+	}
+
+	/* Try to redo what tcp_v4_send_synack did. */
+	req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW);
+	tcp_select_initial_window(tcp_full_space(sk), req->mss,
+				  &req->rcv_wnd, &req->window_clamp, 
+				  0, &rcv_wscale);
+	/* BTW win scale with syncookies is 0 by definition */
+	req->rcv_wscale	  = rcv_wscale; 
+
+	ret = get_cookie_sock(sk, skb, req, &rt->u.dst);
+out:	return ret;
+}
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
new file mode 100644
index 000000000000..3aafb298c1c1
--- /dev/null
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -0,0 +1,698 @@
+/*
+ * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem.
+ *
+ * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $
+ *
+ * Begun April 1, 1996, Mike Shaver.
+ * Added /proc/sys/net/ipv4 directory entry (empty =) ). [MS]
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/config.h>
+#include <net/snmp.h>
+#include <net/ip.h>
+#include <net/route.h>
+#include <net/tcp.h>
+
+/* From af_inet.c */
+extern int sysctl_ip_nonlocal_bind;
+
+/* From icmp.c */
+extern int sysctl_icmp_echo_ignore_all;
+extern int sysctl_icmp_echo_ignore_broadcasts;
+extern int sysctl_icmp_ignore_bogus_error_responses;
+
+/* From ip_fragment.c */
+extern int sysctl_ipfrag_low_thresh;
+extern int sysctl_ipfrag_high_thresh; 
+extern int sysctl_ipfrag_time;
+extern int sysctl_ipfrag_secret_interval;
+
+/* From ip_output.c */
+extern int sysctl_ip_dynaddr;
+
+/* From icmp.c */
+extern int sysctl_icmp_ratelimit;
+extern int sysctl_icmp_ratemask;
+
+/* From igmp.c */
+extern int sysctl_igmp_max_memberships;
+extern int sysctl_igmp_max_msf;
+
+/* From inetpeer.c */
+extern int inet_peer_threshold;
+extern int inet_peer_minttl;
+extern int inet_peer_maxttl;
+extern int inet_peer_gc_mintime;
+extern int inet_peer_gc_maxtime;
+
+#ifdef CONFIG_SYSCTL
+static int tcp_retr1_max = 255; 
+static int ip_local_port_range_min[] = { 1, 1 };
+static int ip_local_port_range_max[] = { 65535, 65535 };
+#endif
+
+struct ipv4_config ipv4_config;
+
+extern ctl_table ipv4_route_table[];
+
+#ifdef CONFIG_SYSCTL
+
+static
+int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
+			void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int val = ipv4_devconf.forwarding;
+	int ret;
+
+	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
+
+	if (write && ipv4_devconf.forwarding != val)
+		inet_forward_change();
+
+	return ret;
+}
+
+static int ipv4_sysctl_forward_strategy(ctl_table *table,
+			 int __user *name, int nlen,
+			 void __user *oldval, size_t __user *oldlenp,
+			 void __user *newval, size_t newlen, 
+			 void **context)
+{
+	int *valp = table->data;
+	int new;
+
+	if (!newval || !newlen)
+		return 0;
+
+	if (newlen != sizeof(int))
+		return -EINVAL;
+
+	if (get_user(new, (int __user *)newval))
+		return -EFAULT;
+
+	if (new == *valp)
+		return 0;
+
+	if (oldval && oldlenp) {
+		size_t len;
+
+		if (get_user(len, oldlenp))
+			return -EFAULT;
+
+		if (len) {
+			if (len > table->maxlen)
+				len = table->maxlen;
+			if (copy_to_user(oldval, valp, len))
+				return -EFAULT;
+			if (put_user(len, oldlenp))
+				return -EFAULT;
+		}
+	}
+
+	*valp = new;
+	inet_forward_change();
+	return 1;
+}
+
+ctl_table ipv4_table[] = {
+        {
+		.ctl_name	= NET_IPV4_TCP_TIMESTAMPS,
+		.procname	= "tcp_timestamps",
+		.data		= &sysctl_tcp_timestamps,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+        {
+		.ctl_name	= NET_IPV4_TCP_WINDOW_SCALING,
+		.procname	= "tcp_window_scaling",
+		.data		= &sysctl_tcp_window_scaling,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+        {
+		.ctl_name	= NET_IPV4_TCP_SACK,
+		.procname	= "tcp_sack",
+		.data		= &sysctl_tcp_sack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+        {
+		.ctl_name	= NET_IPV4_TCP_RETRANS_COLLAPSE,
+		.procname	= "tcp_retrans_collapse",
+		.data		= &sysctl_tcp_retrans_collapse,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+        {
+		.ctl_name	= NET_IPV4_FORWARD,
+		.procname	= "ip_forward",
+		.data		= &ipv4_devconf.forwarding,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &ipv4_sysctl_forward,
+		.strategy	= &ipv4_sysctl_forward_strategy
+	},
+        {
+		.ctl_name	= NET_IPV4_DEFAULT_TTL,
+		.procname	= "ip_default_ttl",
+ 		.data		= &sysctl_ip_default_ttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &ipv4_doint_and_flush,
+		.strategy	= &ipv4_doint_and_flush_strategy,
+	},
+        {
+		.ctl_name	= NET_IPV4_AUTOCONFIG,
+		.procname	= "ip_autoconfig",
+		.data		= &ipv4_config.autoconfig,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+        {
+		.ctl_name	= NET_IPV4_NO_PMTU_DISC,
+		.procname	= "ip_no_pmtu_disc",
+		.data		= &ipv4_config.no_pmtu_disc,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_NONLOCAL_BIND,
+		.procname	= "ip_nonlocal_bind",
+		.data		= &sysctl_ip_nonlocal_bind,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_SYN_RETRIES,
+		.procname	= "tcp_syn_retries",
+		.data		= &sysctl_tcp_syn_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_SYNACK_RETRIES,
+		.procname	= "tcp_synack_retries",
+		.data		= &sysctl_tcp_synack_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_MAX_ORPHANS,
+		.procname	= "tcp_max_orphans",
+		.data		= &sysctl_tcp_max_orphans,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_MAX_TW_BUCKETS,
+		.procname	= "tcp_max_tw_buckets",
+		.data		= &sysctl_tcp_max_tw_buckets,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_IPFRAG_HIGH_THRESH,
+		.procname	= "ipfrag_high_thresh",
+		.data		= &sysctl_ipfrag_high_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_IPFRAG_LOW_THRESH,
+		.procname	= "ipfrag_low_thresh",
+		.data		= &sysctl_ipfrag_low_thresh,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_DYNADDR,
+		.procname	= "ip_dynaddr",
+		.data		= &sysctl_ip_dynaddr,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_IPFRAG_TIME,
+		.procname	= "ipfrag_time",
+		.data		= &sysctl_ipfrag_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_KEEPALIVE_TIME,
+		.procname	= "tcp_keepalive_time",
+		.data		= &sysctl_tcp_keepalive_time,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_KEEPALIVE_PROBES,
+		.procname	= "tcp_keepalive_probes",
+		.data		= &sysctl_tcp_keepalive_probes,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_KEEPALIVE_INTVL,
+		.procname	= "tcp_keepalive_intvl",
+		.data		= &sysctl_tcp_keepalive_intvl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_RETRIES1,
+		.procname	= "tcp_retries1",
+		.data		= &sysctl_tcp_retries1,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra2		= &tcp_retr1_max
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_RETRIES2,
+		.procname	= "tcp_retries2",
+		.data		= &sysctl_tcp_retries2,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_TCP_FIN_TIMEOUT,
+		.procname	= "tcp_fin_timeout",
+		.data		= &sysctl_tcp_fin_timeout,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+#ifdef CONFIG_SYN_COOKIES
+	{
+		.ctl_name	= NET_TCP_SYNCOOKIES,
+		.procname	= "tcp_syncookies",
+		.data		= &sysctl_tcp_syncookies,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+#endif
+	{
+		.ctl_name	= NET_TCP_TW_RECYCLE,
+		.procname	= "tcp_tw_recycle",
+		.data		= &sysctl_tcp_tw_recycle,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_ABORT_ON_OVERFLOW,
+		.procname	= "tcp_abort_on_overflow",
+		.data		= &sysctl_tcp_abort_on_overflow,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_STDURG,
+		.procname	= "tcp_stdurg",
+		.data		= &sysctl_tcp_stdurg,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_RFC1337,
+		.procname	= "tcp_rfc1337",
+		.data		= &sysctl_tcp_rfc1337,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_MAX_SYN_BACKLOG,
+		.procname	= "tcp_max_syn_backlog",
+		.data		= &sysctl_max_syn_backlog,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_LOCAL_PORT_RANGE,
+		.procname	= "ip_local_port_range",
+		.data		= &sysctl_local_port_range,
+		.maxlen		= sizeof(sysctl_local_port_range),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= ip_local_port_range_min,
+		.extra2		= ip_local_port_range_max
+	},
+	{
+		.ctl_name	= NET_IPV4_ICMP_ECHO_IGNORE_ALL,
+		.procname	= "icmp_echo_ignore_all",
+		.data		= &sysctl_icmp_echo_ignore_all,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
+		.procname	= "icmp_echo_ignore_broadcasts",
+		.data		= &sysctl_icmp_echo_ignore_broadcasts,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
+		.procname	= "icmp_ignore_bogus_error_responses",
+		.data		= &sysctl_icmp_ignore_bogus_error_responses,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_ROUTE,
+		.procname	= "route",
+		.maxlen		= 0,
+		.mode		= 0555,
+		.child		= ipv4_route_table
+	},
+#ifdef CONFIG_IP_MULTICAST
+	{
+		.ctl_name	= NET_IPV4_IGMP_MAX_MEMBERSHIPS,
+		.procname	= "igmp_max_memberships",
+		.data		= &sysctl_igmp_max_memberships,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+
+#endif
+	{
+		.ctl_name	= NET_IPV4_IGMP_MAX_MSF,
+		.procname	= "igmp_max_msf",
+		.data		= &sysctl_igmp_max_msf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_INET_PEER_THRESHOLD,
+		.procname	= "inet_peer_threshold",
+		.data		= &inet_peer_threshold,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_INET_PEER_MINTTL,
+		.procname	= "inet_peer_minttl",
+		.data		= &inet_peer_minttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+	{
+		.ctl_name	= NET_IPV4_INET_PEER_MAXTTL,
+		.procname	= "inet_peer_maxttl",
+		.data		= &inet_peer_maxttl,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+	{
+		.ctl_name	= NET_IPV4_INET_PEER_GC_MINTIME,
+		.procname	= "inet_peer_gc_mintime",
+		.data		= &inet_peer_gc_mintime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+	{
+		.ctl_name	= NET_IPV4_INET_PEER_GC_MAXTIME,
+		.procname	= "inet_peer_gc_maxtime",
+		.data		= &inet_peer_gc_maxtime,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+	{
+		.ctl_name	= NET_TCP_ORPHAN_RETRIES,
+		.procname	= "tcp_orphan_retries",
+		.data		= &sysctl_tcp_orphan_retries,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_FACK,
+		.procname	= "tcp_fack",
+		.data		= &sysctl_tcp_fack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_REORDERING,
+		.procname	= "tcp_reordering",
+		.data		= &sysctl_tcp_reordering,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_ECN,
+		.procname	= "tcp_ecn",
+		.data		= &sysctl_tcp_ecn,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_DSACK,
+		.procname	= "tcp_dsack",
+		.data		= &sysctl_tcp_dsack,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_MEM,
+		.procname	= "tcp_mem",
+		.data		= &sysctl_tcp_mem,
+		.maxlen		= sizeof(sysctl_tcp_mem),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_WMEM,
+		.procname	= "tcp_wmem",
+		.data		= &sysctl_tcp_wmem,
+		.maxlen		= sizeof(sysctl_tcp_wmem),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_RMEM,
+		.procname	= "tcp_rmem",
+		.data		= &sysctl_tcp_rmem,
+		.maxlen		= sizeof(sysctl_tcp_rmem),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_APP_WIN,
+		.procname	= "tcp_app_win",
+		.data		= &sysctl_tcp_app_win,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_ADV_WIN_SCALE,
+		.procname	= "tcp_adv_win_scale",
+		.data		= &sysctl_tcp_adv_win_scale,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_ICMP_RATELIMIT,
+		.procname	= "icmp_ratelimit",
+		.data		= &sysctl_icmp_ratelimit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_ICMP_RATEMASK,
+		.procname	= "icmp_ratemask",
+		.data		= &sysctl_icmp_ratemask,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_TW_REUSE,
+		.procname	= "tcp_tw_reuse",
+		.data		= &sysctl_tcp_tw_reuse,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_FRTO,
+		.procname	= "tcp_frto",
+		.data		= &sysctl_tcp_frto,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_TCP_LOW_LATENCY,
+		.procname	= "tcp_low_latency",
+		.data		= &sysctl_tcp_low_latency,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= NET_IPV4_IPFRAG_SECRET_INTERVAL,
+		.procname	= "ipfrag_secret_interval",
+		.data		= &sysctl_ipfrag_secret_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies
+	},
+	{
+		.ctl_name	= NET_TCP_NO_METRICS_SAVE,
+		.procname	= "tcp_no_metrics_save",
+		.data		= &sysctl_tcp_nometrics_save,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_WESTWOOD, 
+		.procname	= "tcp_westwood",
+		.data		= &sysctl_tcp_westwood,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_VEGAS,
+		.procname	= "tcp_vegas_cong_avoid",
+		.data		= &sysctl_tcp_vegas_cong_avoid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_VEGAS_ALPHA,
+		.procname	= "tcp_vegas_alpha",
+		.data		= &sysctl_tcp_vegas_alpha,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_VEGAS_BETA,
+		.procname	= "tcp_vegas_beta",
+		.data		= &sysctl_tcp_vegas_beta,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_VEGAS_GAMMA,
+		.procname	= "tcp_vegas_gamma",
+		.data		= &sysctl_tcp_vegas_gamma,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BIC,
+		.procname	= "tcp_bic",
+		.data		= &sysctl_tcp_bic,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BIC_FAST_CONVERGENCE,
+		.procname	= "tcp_bic_fast_convergence",
+		.data		= &sysctl_tcp_bic_fast_convergence,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BIC_LOW_WINDOW,
+		.procname	= "tcp_bic_low_window",
+		.data		= &sysctl_tcp_bic_low_window,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_MODERATE_RCVBUF,
+		.procname	= "tcp_moderate_rcvbuf",
+		.data		= &sysctl_tcp_moderate_rcvbuf,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_TSO_WIN_DIVISOR,
+		.procname	= "tcp_tso_win_divisor",
+		.data		= &sysctl_tcp_tso_win_divisor,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= NET_TCP_BIC_BETA,
+		.procname	= "tcp_bic_beta",
+		.data		= &sysctl_tcp_bic_beta,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{ .ctl_name = 0 }
+};
+
+#endif /* CONFIG_SYSCTL */
+
+EXPORT_SYMBOL(ipv4_config);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
new file mode 100644
index 000000000000..5cff56af7855
--- /dev/null
+++ b/net/ipv4/tcp.c
@@ -0,0 +1,2386 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version:	$Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ *
+ * Fixes:
+ *		Alan Cox	:	Numerous verify_area() calls
+ *		Alan Cox	:	Set the ACK bit on a reset
+ *		Alan Cox	:	Stopped it crashing if it closed while
+ *					sk->inuse=1 and was trying to connect
+ *					(tcp_err()).
+ *		Alan Cox	:	All icmp error handling was broken
+ *					pointers passed where wrong and the
+ *					socket was looked up backwards. Nobody
+ *					tested any icmp error code obviously.
+ *		Alan Cox	:	tcp_err() now handled properly. It
+ *					wakes people on errors. poll
+ *					behaves and the icmp error race
+ *					has gone by moving it into sock.c
+ *		Alan Cox	:	tcp_send_reset() fixed to work for
+ *					everything not just packets for
+ *					unknown sockets.
+ *		Alan Cox	:	tcp option processing.
+ *		Alan Cox	:	Reset tweaked (still not 100%) [Had
+ *					syn rule wrong]
+ *		Herp Rosmanith  :	More reset fixes
+ *		Alan Cox	:	No longer acks invalid rst frames.
+ *					Acking any kind of RST is right out.
+ *		Alan Cox	:	Sets an ignore me flag on an rst
+ *					receive otherwise odd bits of prattle
+ *					escape still
+ *		Alan Cox	:	Fixed another acking RST frame bug.
+ *					Should stop LAN workplace lockups.
+ *		Alan Cox	: 	Some tidyups using the new skb list
+ *					facilities
+ *		Alan Cox	:	sk->keepopen now seems to work
+ *		Alan Cox	:	Pulls options out correctly on accepts
+ *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
+ *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
+ *					bit to skb ops.
+ *		Alan Cox	:	Tidied tcp_data to avoid a potential
+ *					nasty.
+ *		Alan Cox	:	Added some better commenting, as the
+ *					tcp is hard to follow
+ *		Alan Cox	:	Removed incorrect check for 20 * psh
+ *	Michael O'Reilly	:	ack < copied bug fix.
+ *	Johannes Stille		:	Misc tcp fixes (not all in yet).
+ *		Alan Cox	:	FIN with no memory -> CRASH
+ *		Alan Cox	:	Added socket option proto entries.
+ *					Also added awareness of them to accept.
+ *		Alan Cox	:	Added TCP options (SOL_TCP)
+ *		Alan Cox	:	Switched wakeup calls to callbacks,
+ *					so the kernel can layer network
+ *					sockets.
+ *		Alan Cox	:	Use ip_tos/ip_ttl settings.
+ *		Alan Cox	:	Handle FIN (more) properly (we hope).
+ *		Alan Cox	:	RST frames sent on unsynchronised
+ *					state ack error.
+ *		Alan Cox	:	Put in missing check for SYN bit.
+ *		Alan Cox	:	Added tcp_select_window() aka NET2E
+ *					window non shrink trick.
+ *		Alan Cox	:	Added a couple of small NET2E timer
+ *					fixes
+ *		Charles Hedrick :	TCP fixes
+ *		Toomas Tamm	:	TCP window fixes
+ *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
+ *		Charles Hedrick	:	Rewrote most of it to actually work
+ *		Linus		:	Rewrote tcp_read() and URG handling
+ *					completely
+ *		Gerhard Koerting:	Fixed some missing timer handling
+ *		Matthew Dillon  :	Reworked TCP machine states as per RFC
+ *		Gerhard Koerting:	PC/TCP workarounds
+ *		Adam Caldwell	:	Assorted timer/timing errors
+ *		Matthew Dillon	:	Fixed another RST bug
+ *		Alan Cox	:	Move to kernel side addressing changes.
+ *		Alan Cox	:	Beginning work on TCP fastpathing
+ *					(not yet usable)
+ *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
+ *		Alan Cox	:	TCP fast path debugging
+ *		Alan Cox	:	Window clamping
+ *		Michael Riepe	:	Bug in tcp_check()
+ *		Matt Dillon	:	More TCP improvements and RST bug fixes
+ *		Matt Dillon	:	Yet more small nasties remove from the
+ *					TCP code (Be very nice to this man if
+ *					tcp finally works 100%) 8)
+ *		Alan Cox	:	BSD accept semantics.
+ *		Alan Cox	:	Reset on closedown bug.
+ *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
+ *		Michael Pall	:	Handle poll() after URG properly in
+ *					all cases.
+ *		Michael Pall	:	Undo the last fix in tcp_read_urg()
+ *					(multi URG PUSH broke rlogin).
+ *		Michael Pall	:	Fix the multi URG PUSH problem in
+ *					tcp_readable(), poll() after URG
+ *					works now.
+ *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
+ *					BSD api.
+ *		Alan Cox	:	Changed the semantics of sk->socket to
+ *					fix a race and a signal problem with
+ *					accept() and async I/O.
+ *		Alan Cox	:	Relaxed the rules on tcp_sendto().
+ *		Yury Shevchuk	:	Really fixed accept() blocking problem.
+ *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
+ *					clients/servers which listen in on
+ *					fixed ports.
+ *		Alan Cox	:	Cleaned the above up and shrank it to
+ *					a sensible code size.
+ *		Alan Cox	:	Self connect lockup fix.
+ *		Alan Cox	:	No connect to multicast.
+ *		Ross Biro	:	Close unaccepted children on master
+ *					socket close.
+ *		Alan Cox	:	Reset tracing code.
+ *		Alan Cox	:	Spurious resets on shutdown.
+ *		Alan Cox	:	Giant 15 minute/60 second timer error
+ *		Alan Cox	:	Small whoops in polling before an
+ *					accept.
+ *		Alan Cox	:	Kept the state trace facility since
+ *					it's handy for debugging.
+ *		Alan Cox	:	More reset handler fixes.
+ *		Alan Cox	:	Started rewriting the code based on
+ *					the RFC's for other useful protocol
+ *					references see: Comer, KA9Q NOS, and
+ *					for a reference on the difference
+ *					between specifications and how BSD
+ *					works see the 4.4lite source.
+ *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
+ *					close.
+ *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
+ *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
+ *		Alan Cox	:	Reimplemented timers as per the RFC
+ *					and using multiple timers for sanity.
+ *		Alan Cox	:	Small bug fixes, and a lot of new
+ *					comments.
+ *		Alan Cox	:	Fixed dual reader crash by locking
+ *					the buffers (much like datagram.c)
+ *		Alan Cox	:	Fixed stuck sockets in probe. A probe
+ *					now gets fed up of retrying without
+ *					(even a no space) answer.
+ *		Alan Cox	:	Extracted closing code better
+ *		Alan Cox	:	Fixed the closing state machine to
+ *					resemble the RFC.
+ *		Alan Cox	:	More 'per spec' fixes.
+ *		Jorge Cwik	:	Even faster checksumming.
+ *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
+ *					only frames. At least one pc tcp stack
+ *					generates them.
+ *		Alan Cox	:	Cache last socket.
+ *		Alan Cox	:	Per route irtt.
+ *		Matt Day	:	poll()->select() match BSD precisely on error
+ *		Alan Cox	:	New buffers
+ *		Marc Tamsky	:	Various sk->prot->retransmits and
+ *					sk->retransmits misupdating fixed.
+ *					Fixed tcp_write_timeout: stuck close,
+ *					and TCP syn retries gets used now.
+ *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
+ *					ack if state is TCP_CLOSED.
+ *		Alan Cox	:	Look up device on a retransmit - routes may
+ *					change. Doesn't yet cope with MSS shrink right
+ *					but it's a start!
+ *		Marc Tamsky	:	Closing in closing fixes.
+ *		Mike Shaver	:	RFC1122 verifications.
+ *		Alan Cox	:	rcv_saddr errors.
+ *		Alan Cox	:	Block double connect().
+ *		Alan Cox	:	Small hooks for enSKIP.
+ *		Alexey Kuznetsov:	Path MTU discovery.
+ *		Alan Cox	:	Support soft errors.
+ *		Alan Cox	:	Fix MTU discovery pathological case
+ *					when the remote claims no mtu!
+ *		Marc Tamsky	:	TCP_CLOSE fix.
+ *		Colin (G3TNE)	:	Send a reset on syn ack replies in
+ *					window but wrong (fixes NT lpd problems)
+ *		Pedro Roque	:	Better TCP window handling, delayed ack.
+ *		Joerg Reuter	:	No modification of locked buffers in
+ *					tcp_do_retransmit()
+ *		Eric Schenk	:	Changed receiver side silly window
+ *					avoidance algorithm to BSD style
+ *					algorithm. This doubles throughput
+ *					against machines running Solaris,
+ *					and seems to result in general
+ *					improvement.
+ *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *	Mike McLagan		:	Routing by source
+ *		Keith Owens	:	Do proper merging with partial SKB's in
+ *					tcp_do_sendmsg to avoid burstiness.
+ *		Eric Schenk	:	Fix fast close down bug with
+ *					shutdown() followed by close().
+ *		Andi Kleen 	:	Make poll agree with SIGIO
+ *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
+ *					lingertime == 0 (RFC 793 ABORT Call)
+ *	Hirokazu Takahashi	:	Use copy_from_user() instead of
+ *					csum_and_copy_from_user() if possible.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or(at your option) any later version.
+ *
+ * Description of States:
+ *
+ *	TCP_SYN_SENT		sent a connection request, waiting for ack
+ *
+ *	TCP_SYN_RECV		received a connection request, sent ack,
+ *				waiting for final ack in three-way handshake.
+ *
+ *	TCP_ESTABLISHED		connection established
+ *
+ *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
+ *				transmission of remaining buffered data
+ *
+ *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
+ *				to shutdown
+ *
+ *	TCP_CLOSING		both sides have shutdown but we still have
+ *				data we have to finish sending
+ *
+ *	TCP_TIME_WAIT		timeout to catch resent junk before entering
+ *				closed, can only be entered from FIN_WAIT2
+ *				or CLOSING.  Required because the other end
+ *				may not have gotten our last ACK causing it
+ *				to retransmit the data packet (which we ignore)
+ *
+ *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
+ *				us to finish writing our data and to shutdown
+ *				(we have to close() to move on to LAST_ACK)
+ *
+ *	TCP_LAST_ACK		out side has shutdown after remote has
+ *				shutdown.  There may still be data in our
+ *				buffer that we have to finish sending
+ *
+ *	TCP_CLOSE		socket is finished
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/smp_lock.h>
+#include <linux/fs.h>
+#include <linux/random.h>
+#include <linux/bootmem.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+
+
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+
+int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+
+DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics);
+
+kmem_cache_t *tcp_openreq_cachep;
+kmem_cache_t *tcp_bucket_cachep;
+kmem_cache_t *tcp_timewait_cachep;
+
+atomic_t tcp_orphan_count = ATOMIC_INIT(0);
+
+int sysctl_tcp_mem[3];
+int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 };
+int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 };
+
+EXPORT_SYMBOL(sysctl_tcp_mem);
+EXPORT_SYMBOL(sysctl_tcp_rmem);
+EXPORT_SYMBOL(sysctl_tcp_wmem);
+
+atomic_t tcp_memory_allocated;	/* Current allocated memory. */
+atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
+
+EXPORT_SYMBOL(tcp_memory_allocated);
+EXPORT_SYMBOL(tcp_sockets_allocated);
+
+/*
+ * Pressure flag: try to collapse.
+ * Technical note: it is used by multiple contexts non atomically.
+ * All the sk_stream_mem_schedule() is of this nature: accounting
+ * is strict, actions are advisory and have some latency.
+ */
+int tcp_memory_pressure;
+
+EXPORT_SYMBOL(tcp_memory_pressure);
+
+void tcp_enter_memory_pressure(void)
+{
+	if (!tcp_memory_pressure) {
+		NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES);
+		tcp_memory_pressure = 1;
+	}
+}
+
+EXPORT_SYMBOL(tcp_enter_memory_pressure);
+
+/*
+ * LISTEN is a special case for poll..
+ */
+static __inline__ unsigned int tcp_listen_poll(struct sock *sk,
+					       poll_table *wait)
+{
+	return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0;
+}
+
+/*
+ *	Wait for a TCP event.
+ *
+ *	Note that we don't need to lock the socket, as the upper poll layers
+ *	take care of normal races (between the test and the event) and we don't
+ *	go look at any of the socket buffers directly.
+ */
+unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	unsigned int mask;
+	struct sock *sk = sock->sk;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	poll_wait(file, sk->sk_sleep, wait);
+	if (sk->sk_state == TCP_LISTEN)
+		return tcp_listen_poll(sk, wait);
+
+	/* Socket is not locked. We are protected from async events
+	   by poll logic and correct handling of state changes
+	   made by another threads is impossible in any case.
+	 */
+
+	mask = 0;
+	if (sk->sk_err)
+		mask = POLLERR;
+
+	/*
+	 * POLLHUP is certainly not done right. But poll() doesn't
+	 * have a notion of HUP in just one direction, and for a
+	 * socket the read side is more interesting.
+	 *
+	 * Some poll() documentation says that POLLHUP is incompatible
+	 * with the POLLOUT/POLLWR flags, so somebody should check this
+	 * all. But careful, it tends to be safer to return too many
+	 * bits than too few, and you can easily break real applications
+	 * if you don't tell them that something has hung up!
+	 *
+	 * Check-me.
+	 *
+	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
+	 * our fs/select.c). It means that after we received EOF,
+	 * poll always returns immediately, making impossible poll() on write()
+	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
+	 * if and only if shutdown has been made in both directions.
+	 * Actually, it is interesting to look how Solaris and DUX
+	 * solve this dilemma. I would prefer, if PULLHUP were maskable,
+	 * then we could set it on SND_SHUTDOWN. BTW examples given
+	 * in Stevens' books assume exactly this behaviour, it explains
+	 * why PULLHUP is incompatible with POLLOUT.	--ANK
+	 *
+	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
+	 * blocking on fresh not-connected or disconnected socket. --ANK
+	 */
+	if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+		mask |= POLLHUP;
+	if (sk->sk_shutdown & RCV_SHUTDOWN)
+		mask |= POLLIN | POLLRDNORM;
+
+	/* Connected? */
+	if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		/* Potential race condition. If read of tp below will
+		 * escape above sk->sk_state, we can be illegally awaken
+		 * in SYN_* states. */
+		if ((tp->rcv_nxt != tp->copied_seq) &&
+		    (tp->urg_seq != tp->copied_seq ||
+		     tp->rcv_nxt != tp->copied_seq + 1 ||
+		     sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
+			mask |= POLLIN | POLLRDNORM;
+
+		if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+			if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+				mask |= POLLOUT | POLLWRNORM;
+			} else {  /* send SIGIO later */
+				set_bit(SOCK_ASYNC_NOSPACE,
+					&sk->sk_socket->flags);
+				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+				/* Race breaker. If space is freed after
+				 * wspace test but before the flags are set,
+				 * IO signal will be lost.
+				 */
+				if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+					mask |= POLLOUT | POLLWRNORM;
+			}
+		}
+
+		if (tp->urg_data & TCP_URG_VALID)
+			mask |= POLLPRI;
+	}
+	return mask;
+}
+
+int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int answ;
+
+	switch (cmd) {
+	case SIOCINQ:
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		lock_sock(sk);
+		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+			answ = 0;
+		else if (sock_flag(sk, SOCK_URGINLINE) ||
+			 !tp->urg_data ||
+			 before(tp->urg_seq, tp->copied_seq) ||
+			 !before(tp->urg_seq, tp->rcv_nxt)) {
+			answ = tp->rcv_nxt - tp->copied_seq;
+
+			/* Subtract 1, if FIN is in queue. */
+			if (answ && !skb_queue_empty(&sk->sk_receive_queue))
+				answ -=
+		       ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin;
+		} else
+			answ = tp->urg_seq - tp->copied_seq;
+		release_sock(sk);
+		break;
+	case SIOCATMARK:
+		answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
+		break;
+	case SIOCOUTQ:
+		if (sk->sk_state == TCP_LISTEN)
+			return -EINVAL;
+
+		if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
+			answ = 0;
+		else
+			answ = tp->write_seq - tp->snd_una;
+		break;
+	default:
+		return -ENOIOCTLCMD;
+	};
+
+	return put_user(answ, (int __user *)arg);
+}
+
+
+int tcp_listen_start(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_listen_opt *lopt;
+
+	sk->sk_max_ack_backlog = 0;
+	sk->sk_ack_backlog = 0;
+	tp->accept_queue = tp->accept_queue_tail = NULL;
+	rwlock_init(&tp->syn_wait_lock);
+	tcp_delack_init(tp);
+
+	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
+	if (!lopt)
+		return -ENOMEM;
+
+	memset(lopt, 0, sizeof(struct tcp_listen_opt));
+	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
+		if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
+			break;
+	get_random_bytes(&lopt->hash_rnd, 4);
+
+	write_lock_bh(&tp->syn_wait_lock);
+	tp->listen_opt = lopt;
+	write_unlock_bh(&tp->syn_wait_lock);
+
+	/* There is race window here: we announce ourselves listening,
+	 * but this transition is still not validated by get_port().
+	 * It is OK, because this socket enters to hash table only
+	 * after validation is complete.
+	 */
+	sk->sk_state = TCP_LISTEN;
+	if (!sk->sk_prot->get_port(sk, inet->num)) {
+		inet->sport = htons(inet->num);
+
+		sk_dst_reset(sk);
+		sk->sk_prot->hash(sk);
+
+		return 0;
+	}
+
+	sk->sk_state = TCP_CLOSE;
+	write_lock_bh(&tp->syn_wait_lock);
+	tp->listen_opt = NULL;
+	write_unlock_bh(&tp->syn_wait_lock);
+	kfree(lopt);
+	return -EADDRINUSE;
+}
+
+/*
+ *	This routine closes sockets which have been at least partially
+ *	opened, but not yet accepted.
+ */
+
+static void tcp_listen_stop (struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct open_request *acc_req = tp->accept_queue;
+	struct open_request *req;
+	int i;
+
+	tcp_delete_keepalive_timer(sk);
+
+	/* make all the listen_opt local to us */
+	write_lock_bh(&tp->syn_wait_lock);
+	tp->listen_opt = NULL;
+	write_unlock_bh(&tp->syn_wait_lock);
+	tp->accept_queue = tp->accept_queue_tail = NULL;
+
+	if (lopt->qlen) {
+		for (i = 0; i < TCP_SYNQ_HSIZE; i++) {
+			while ((req = lopt->syn_table[i]) != NULL) {
+				lopt->syn_table[i] = req->dl_next;
+				lopt->qlen--;
+				tcp_openreq_free(req);
+
+		/* Following specs, it would be better either to send FIN
+		 * (and enter FIN-WAIT-1, it is normal close)
+		 * or to send active reset (abort).
+		 * Certainly, it is pretty dangerous while synflood, but it is
+		 * bad justification for our negligence 8)
+		 * To be honest, we are not able to make either
+		 * of the variants now.			--ANK
+		 */
+			}
+		}
+	}
+	BUG_TRAP(!lopt->qlen);
+
+	kfree(lopt);
+
+	while ((req = acc_req) != NULL) {
+		struct sock *child = req->sk;
+
+		acc_req = req->dl_next;
+
+		local_bh_disable();
+		bh_lock_sock(child);
+		BUG_TRAP(!sock_owned_by_user(child));
+		sock_hold(child);
+
+		tcp_disconnect(child, O_NONBLOCK);
+
+		sock_orphan(child);
+
+		atomic_inc(&tcp_orphan_count);
+
+		tcp_destroy_sock(child);
+
+		bh_unlock_sock(child);
+		local_bh_enable();
+		sock_put(child);
+
+		sk_acceptq_removed(sk);
+		tcp_openreq_fastfree(req);
+	}
+	BUG_TRAP(!sk->sk_ack_backlog);
+}
+
+static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+	tp->pushed_seq = tp->write_seq;
+}
+
+static inline int forced_push(struct tcp_sock *tp)
+{
+	return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1));
+}
+
+static inline void skb_entail(struct sock *sk, struct tcp_sock *tp,
+			      struct sk_buff *skb)
+{
+	skb->csum = 0;
+	TCP_SKB_CB(skb)->seq = tp->write_seq;
+	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
+	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
+	TCP_SKB_CB(skb)->sacked = 0;
+	skb_header_release(skb);
+	__skb_queue_tail(&sk->sk_write_queue, skb);
+	sk_charge_skb(sk, skb);
+	if (!sk->sk_send_head)
+		sk->sk_send_head = skb;
+	else if (tp->nonagle&TCP_NAGLE_PUSH)
+		tp->nonagle &= ~TCP_NAGLE_PUSH; 
+}
+
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
+				struct sk_buff *skb)
+{
+	if (flags & MSG_OOB) {
+		tp->urg_mode = 1;
+		tp->snd_up = tp->write_seq;
+		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
+	}
+}
+
+static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags,
+			    int mss_now, int nonagle)
+{
+	if (sk->sk_send_head) {
+		struct sk_buff *skb = sk->sk_write_queue.prev;
+		if (!(flags & MSG_MORE) || forced_push(tp))
+			tcp_mark_push(tp, skb);
+		tcp_mark_urg(tp, flags, skb);
+		__tcp_push_pending_frames(sk, tp, mss_now,
+					  (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
+	}
+}
+
+static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
+			 size_t psize, int flags)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now;
+	int err;
+	ssize_t copied;
+	long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+			goto out_err;
+
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		goto do_error;
+
+	while (psize > 0) {
+		struct sk_buff *skb = sk->sk_write_queue.prev;
+		struct page *page = pages[poffset / PAGE_SIZE];
+		int copy, i, can_coalesce;
+		int offset = poffset % PAGE_SIZE;
+		int size = min_t(size_t, psize, PAGE_SIZE - offset);
+
+		if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
+new_segment:
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_sndbuf;
+
+			skb = sk_stream_alloc_pskb(sk, 0, 0,
+						   sk->sk_allocation);
+			if (!skb)
+				goto wait_for_memory;
+
+			skb_entail(sk, tp, skb);
+			copy = mss_now;
+		}
+
+		if (copy > size)
+			copy = size;
+
+		i = skb_shinfo(skb)->nr_frags;
+		can_coalesce = skb_can_coalesce(skb, i, page, offset);
+		if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+			tcp_mark_push(tp, skb);
+			goto new_segment;
+		}
+		if (sk->sk_forward_alloc < copy &&
+		    !sk_stream_mem_schedule(sk, copy, 0))
+			goto wait_for_memory;
+		
+		if (can_coalesce) {
+			skb_shinfo(skb)->frags[i - 1].size += copy;
+		} else {
+			get_page(page);
+			skb_fill_page_desc(skb, i, page, offset, copy);
+		}
+
+		skb->len += copy;
+		skb->data_len += copy;
+		skb->truesize += copy;
+		sk->sk_wmem_queued += copy;
+		sk->sk_forward_alloc -= copy;
+		skb->ip_summed = CHECKSUM_HW;
+		tp->write_seq += copy;
+		TCP_SKB_CB(skb)->end_seq += copy;
+		skb_shinfo(skb)->tso_segs = 0;
+
+		if (!copied)
+			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+		copied += copy;
+		poffset += copy;
+		if (!(psize -= copy))
+			goto out;
+
+		if (skb->len != mss_now || (flags & MSG_OOB))
+			continue;
+
+		if (forced_push(tp)) {
+			tcp_mark_push(tp, skb);
+			__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+		} else if (skb == sk->sk_send_head)
+			tcp_push_one(sk, mss_now);
+		continue;
+
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		if (copied)
+			tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+		if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+			goto do_error;
+
+		mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+	}
+
+out:
+	if (copied)
+		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
+	return copied;
+
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	return sk_stream_error(sk, flags, err);
+}
+
+ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
+		     size_t size, int flags)
+{
+	ssize_t res;
+	struct sock *sk = sock->sk;
+
+#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
+
+	if (!(sk->sk_route_caps & NETIF_F_SG) ||
+	    !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS))
+		return sock_no_sendpage(sock, page, offset, size, flags);
+
+#undef TCP_ZC_CSUM_FLAGS
+
+	lock_sock(sk);
+	TCP_CHECK_TIMER(sk);
+	res = do_tcp_sendpages(sk, &page, offset, size, flags);
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return res;
+}
+
+#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
+#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
+
+static inline int select_size(struct sock *sk, struct tcp_sock *tp)
+{
+	int tmp = tp->mss_cache_std;
+
+	if (sk->sk_route_caps & NETIF_F_SG) {
+		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
+
+		if (tmp >= pgbreak &&
+		    tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE)
+			tmp = pgbreak;
+	}
+	return tmp;
+}
+
+int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t size)
+{
+	struct iovec *iov;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int iovlen, flags;
+	int mss_now;
+	int err, copied;
+	long timeo;
+
+	lock_sock(sk);
+	TCP_CHECK_TIMER(sk);
+
+	flags = msg->msg_flags;
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	/* Wait for a connection to finish. */
+	if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
+		if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+			goto out_err;
+
+	/* This should be in poll */
+	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+
+	mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+
+	/* Ok commence sending. */
+	iovlen = msg->msg_iovlen;
+	iov = msg->msg_iov;
+	copied = 0;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		goto do_error;
+
+	while (--iovlen >= 0) {
+		int seglen = iov->iov_len;
+		unsigned char __user *from = iov->iov_base;
+
+		iov++;
+
+		while (seglen > 0) {
+			int copy;
+
+			skb = sk->sk_write_queue.prev;
+
+			if (!sk->sk_send_head ||
+			    (copy = mss_now - skb->len) <= 0) {
+
+new_segment:
+				/* Allocate new segment. If the interface is SG,
+				 * allocate skb fitting to single page.
+				 */
+				if (!sk_stream_memory_free(sk))
+					goto wait_for_sndbuf;
+
+				skb = sk_stream_alloc_pskb(sk, select_size(sk, tp),
+							   0, sk->sk_allocation);
+				if (!skb)
+					goto wait_for_memory;
+
+				/*
+				 * Check whether we can use HW checksum.
+				 */
+				if (sk->sk_route_caps &
+				    (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM |
+				     NETIF_F_HW_CSUM))
+					skb->ip_summed = CHECKSUM_HW;
+
+				skb_entail(sk, tp, skb);
+				copy = mss_now;
+			}
+
+			/* Try to append data to the end of skb. */
+			if (copy > seglen)
+				copy = seglen;
+
+			/* Where to copy to? */
+			if (skb_tailroom(skb) > 0) {
+				/* We have some space in skb head. Superb! */
+				if (copy > skb_tailroom(skb))
+					copy = skb_tailroom(skb);
+				if ((err = skb_add_data(skb, from, copy)) != 0)
+					goto do_fault;
+			} else {
+				int merge = 0;
+				int i = skb_shinfo(skb)->nr_frags;
+				struct page *page = TCP_PAGE(sk);
+				int off = TCP_OFF(sk);
+
+				if (skb_can_coalesce(skb, i, page, off) &&
+				    off != PAGE_SIZE) {
+					/* We can extend the last page
+					 * fragment. */
+					merge = 1;
+				} else if (i == MAX_SKB_FRAGS ||
+					   (!i &&
+					   !(sk->sk_route_caps & NETIF_F_SG))) {
+					/* Need to add new fragment and cannot
+					 * do this because interface is non-SG,
+					 * or because all the page slots are
+					 * busy. */
+					tcp_mark_push(tp, skb);
+					goto new_segment;
+				} else if (page) {
+					/* If page is cached, align
+					 * offset to L1 cache boundary
+					 */
+					off = (off + L1_CACHE_BYTES - 1) &
+					      ~(L1_CACHE_BYTES - 1);
+					if (off == PAGE_SIZE) {
+						put_page(page);
+						TCP_PAGE(sk) = page = NULL;
+					}
+				}
+
+				if (!page) {
+					/* Allocate new cache page. */
+					if (!(page = sk_stream_alloc_page(sk)))
+						goto wait_for_memory;
+					off = 0;
+				}
+
+				if (copy > PAGE_SIZE - off)
+					copy = PAGE_SIZE - off;
+
+				/* Time to copy data. We are close to
+				 * the end! */
+				err = skb_copy_to_page(sk, from, skb, page,
+						       off, copy);
+				if (err) {
+					/* If this page was new, give it to the
+					 * socket so it does not get leaked.
+					 */
+					if (!TCP_PAGE(sk)) {
+						TCP_PAGE(sk) = page;
+						TCP_OFF(sk) = 0;
+					}
+					goto do_error;
+				}
+
+				/* Update the skb. */
+				if (merge) {
+					skb_shinfo(skb)->frags[i - 1].size +=
+									copy;
+				} else {
+					skb_fill_page_desc(skb, i, page, off, copy);
+					if (TCP_PAGE(sk)) {
+						get_page(page);
+					} else if (off + copy < PAGE_SIZE) {
+						get_page(page);
+						TCP_PAGE(sk) = page;
+					}
+				}
+
+				TCP_OFF(sk) = off + copy;
+			}
+
+			if (!copied)
+				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
+
+			tp->write_seq += copy;
+			TCP_SKB_CB(skb)->end_seq += copy;
+			skb_shinfo(skb)->tso_segs = 0;
+
+			from += copy;
+			copied += copy;
+			if ((seglen -= copy) == 0 && iovlen == 0)
+				goto out;
+
+			if (skb->len != mss_now || (flags & MSG_OOB))
+				continue;
+
+			if (forced_push(tp)) {
+				tcp_mark_push(tp, skb);
+				__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH);
+			} else if (skb == sk->sk_send_head)
+				tcp_push_one(sk, mss_now);
+			continue;
+
+wait_for_sndbuf:
+			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+			if (copied)
+				tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH);
+
+			if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+				goto do_error;
+
+			mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
+		}
+	}
+
+out:
+	if (copied)
+		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return copied;
+
+do_fault:
+	if (!skb->len) {
+		if (sk->sk_send_head == skb)
+			sk->sk_send_head = NULL;
+		__skb_unlink(skb, skb->list);
+		sk_stream_free_skb(sk, skb);
+	}
+
+do_error:
+	if (copied)
+		goto out;
+out_err:
+	err = sk_stream_error(sk, flags, err);
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return err;
+}
+
+/*
+ *	Handle reading urgent data. BSD has very simple semantics for
+ *	this, no blocking and very strange errors 8)
+ */
+
+static int tcp_recv_urg(struct sock *sk, long timeo,
+			struct msghdr *msg, int len, int flags,
+			int *addr_len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* No URG data to read. */
+	if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data ||
+	    tp->urg_data == TCP_URG_READ)
+		return -EINVAL;	/* Yes this is right ! */
+
+	if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE))
+		return -ENOTCONN;
+
+	if (tp->urg_data & TCP_URG_VALID) {
+		int err = 0;
+		char c = tp->urg_data;
+
+		if (!(flags & MSG_PEEK))
+			tp->urg_data = TCP_URG_READ;
+
+		/* Read urgent data. */
+		msg->msg_flags |= MSG_OOB;
+
+		if (len > 0) {
+			if (!(flags & MSG_TRUNC))
+				err = memcpy_toiovec(msg->msg_iov, &c, 1);
+			len = 1;
+		} else
+			msg->msg_flags |= MSG_TRUNC;
+
+		return err ? -EFAULT : len;
+	}
+
+	if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN))
+		return 0;
+
+	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
+	 * the available implementations agree in this case:
+	 * this call should never block, independent of the
+	 * blocking state of the socket.
+	 * Mike <pall@rz.uni-karlsruhe.de>
+	 */
+	return -EAGAIN;
+}
+
+/* Clean up the receive buffer for full frames taken by the user,
+ * then send an ACK if necessary.  COPIED is the number of bytes
+ * tcp_recvmsg has given to the user so far, it speeds up the
+ * calculation of whether or not we must ACK for the sake of
+ * a window update.
+ */
+static void cleanup_rbuf(struct sock *sk, int copied)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int time_to_ack = 0;
+
+#if TCP_DEBUG
+	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+
+	BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
+#endif
+
+	if (tcp_ack_scheduled(tp)) {
+		   /* Delayed ACKs frequently hit locked sockets during bulk
+		    * receive. */
+		if (tp->ack.blocked ||
+		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
+		    tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss ||
+		    /*
+		     * If this read emptied read buffer, we send ACK, if
+		     * connection is not bidirectional, user drained
+		     * receive buffer and there was a small segment
+		     * in queue.
+		     */
+		    (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) &&
+		     !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc)))
+			time_to_ack = 1;
+	}
+
+	/* We send an ACK if we can now advertise a non-zero window
+	 * which has been raised "significantly".
+	 *
+	 * Even if window raised up to infinity, do not send window open ACK
+	 * in states, where we will not receive more. It is useless.
+	 */
+	if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+		__u32 rcv_window_now = tcp_receive_window(tp);
+
+		/* Optimize, __tcp_select_window() is not cheap. */
+		if (2*rcv_window_now <= tp->window_clamp) {
+			__u32 new_window = __tcp_select_window(sk);
+
+			/* Send ACK now, if this read freed lots of space
+			 * in our buffer. Certainly, new_window is new window.
+			 * We can advertise it now, if it is not less than current one.
+			 * "Lots" means "at least twice" here.
+			 */
+			if (new_window && new_window >= 2 * rcv_window_now)
+				time_to_ack = 1;
+		}
+	}
+	if (time_to_ack)
+		tcp_send_ack(sk);
+}
+
+static void tcp_prequeue_process(struct sock *sk)
+{
+	struct sk_buff *skb;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue));
+
+	/* RX process wants to run with disabled BHs, though it is not
+	 * necessary */
+	local_bh_disable();
+	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+		sk->sk_backlog_rcv(sk, skb);
+	local_bh_enable();
+
+	/* Clear memory counter. */
+	tp->ucopy.memory = 0;
+}
+
+static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
+{
+	struct sk_buff *skb;
+	u32 offset;
+
+	skb_queue_walk(&sk->sk_receive_queue, skb) {
+		offset = seq - TCP_SKB_CB(skb)->seq;
+		if (skb->h.th->syn)
+			offset--;
+		if (offset < skb->len || skb->h.th->fin) {
+			*off = offset;
+			return skb;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * This routine provides an alternative to tcp_recvmsg() for routines
+ * that would like to handle copying from skbuffs directly in 'sendfile'
+ * fashion.
+ * Note:
+ *	- It is assumed that the socket was locked by the caller.
+ *	- The routine does not block.
+ *	- At present, there is no support for reading OOB data
+ *	  or for 'peeking' the socket using this routine
+ *	  (although both would be easy to implement).
+ */
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+		  sk_read_actor_t recv_actor)
+{
+	struct sk_buff *skb;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 seq = tp->copied_seq;
+	u32 offset;
+	int copied = 0;
+
+	if (sk->sk_state == TCP_LISTEN)
+		return -ENOTCONN;
+	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
+		if (offset < skb->len) {
+			size_t used, len;
+
+			len = skb->len - offset;
+			/* Stop reading if we hit a patch of urgent data */
+			if (tp->urg_data) {
+				u32 urg_offset = tp->urg_seq - seq;
+				if (urg_offset < len)
+					len = urg_offset;
+				if (!len)
+					break;
+			}
+			used = recv_actor(desc, skb, offset, len);
+			if (used <= len) {
+				seq += used;
+				copied += used;
+				offset += used;
+			}
+			if (offset != skb->len)
+				break;
+		}
+		if (skb->h.th->fin) {
+			sk_eat_skb(sk, skb);
+			++seq;
+			break;
+		}
+		sk_eat_skb(sk, skb);
+		if (!desc->count)
+			break;
+	}
+	tp->copied_seq = seq;
+
+	tcp_rcv_space_adjust(sk);
+
+	/* Clean up data we have read: This will do ACK frames. */
+	if (copied)
+		cleanup_rbuf(sk, copied);
+	return copied;
+}
+
+/*
+ *	This routine copies from a sock struct into the user buffer.
+ *
+ *	Technical note: in 2.3 we work on _locked_ socket, so that
+ *	tricks with *seq access order and skb->users are not required.
+ *	Probably, code can be easily improved even more.
+ */
+
+int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t len, int nonblock, int flags, int *addr_len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int copied = 0;
+	u32 peek_seq;
+	u32 *seq;
+	unsigned long used;
+	int err;
+	int target;		/* Read at least this many bytes */
+	long timeo;
+	struct task_struct *user_recv = NULL;
+
+	lock_sock(sk);
+
+	TCP_CHECK_TIMER(sk);
+
+	err = -ENOTCONN;
+	if (sk->sk_state == TCP_LISTEN)
+		goto out;
+
+	timeo = sock_rcvtimeo(sk, nonblock);
+
+	/* Urgent data needs to be handled specially. */
+	if (flags & MSG_OOB)
+		goto recv_urg;
+
+	seq = &tp->copied_seq;
+	if (flags & MSG_PEEK) {
+		peek_seq = tp->copied_seq;
+		seq = &peek_seq;
+	}
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+
+	do {
+		struct sk_buff *skb;
+		u32 offset;
+
+		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
+		if (tp->urg_data && tp->urg_seq == *seq) {
+			if (copied)
+				break;
+			if (signal_pending(current)) {
+				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
+				break;
+			}
+		}
+
+		/* Next get a buffer. */
+
+		skb = skb_peek(&sk->sk_receive_queue);
+		do {
+			if (!skb)
+				break;
+
+			/* Now that we have two receive queues this
+			 * shouldn't happen.
+			 */
+			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
+				printk(KERN_INFO "recvmsg bug: copied %X "
+				       "seq %X\n", *seq, TCP_SKB_CB(skb)->seq);
+				break;
+			}
+			offset = *seq - TCP_SKB_CB(skb)->seq;
+			if (skb->h.th->syn)
+				offset--;
+			if (offset < skb->len)
+				goto found_ok_skb;
+			if (skb->h.th->fin)
+				goto found_fin_ok;
+			BUG_TRAP(flags & MSG_PEEK);
+			skb = skb->next;
+		} while (skb != (struct sk_buff *)&sk->sk_receive_queue);
+
+		/* Well, if we have backlog, try to process it now yet. */
+
+		if (copied >= target && !sk->sk_backlog.tail)
+			break;
+
+		if (copied) {
+			if (sk->sk_err ||
+			    sk->sk_state == TCP_CLOSE ||
+			    (sk->sk_shutdown & RCV_SHUTDOWN) ||
+			    !timeo ||
+			    signal_pending(current) ||
+			    (flags & MSG_PEEK))
+				break;
+		} else {
+			if (sock_flag(sk, SOCK_DONE))
+				break;
+
+			if (sk->sk_err) {
+				copied = sock_error(sk);
+				break;
+			}
+
+			if (sk->sk_shutdown & RCV_SHUTDOWN)
+				break;
+
+			if (sk->sk_state == TCP_CLOSE) {
+				if (!sock_flag(sk, SOCK_DONE)) {
+					/* This occurs when user tries to read
+					 * from never connected socket.
+					 */
+					copied = -ENOTCONN;
+					break;
+				}
+				break;
+			}
+
+			if (!timeo) {
+				copied = -EAGAIN;
+				break;
+			}
+
+			if (signal_pending(current)) {
+				copied = sock_intr_errno(timeo);
+				break;
+			}
+		}
+
+		cleanup_rbuf(sk, copied);
+
+		if (tp->ucopy.task == user_recv) {
+			/* Install new reader */
+			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
+				user_recv = current;
+				tp->ucopy.task = user_recv;
+				tp->ucopy.iov = msg->msg_iov;
+			}
+
+			tp->ucopy.len = len;
+
+			BUG_TRAP(tp->copied_seq == tp->rcv_nxt ||
+				 (flags & (MSG_PEEK | MSG_TRUNC)));
+
+			/* Ugly... If prequeue is not empty, we have to
+			 * process it before releasing socket, otherwise
+			 * order will be broken at second iteration.
+			 * More elegant solution is required!!!
+			 *
+			 * Look: we have the following (pseudo)queues:
+			 *
+			 * 1. packets in flight
+			 * 2. backlog
+			 * 3. prequeue
+			 * 4. receive_queue
+			 *
+			 * Each queue can be processed only if the next ones
+			 * are empty. At this point we have empty receive_queue.
+			 * But prequeue _can_ be not empty after 2nd iteration,
+			 * when we jumped to start of loop because backlog
+			 * processing added something to receive_queue.
+			 * We cannot release_sock(), because backlog contains
+			 * packets arrived _after_ prequeued ones.
+			 *
+			 * Shortly, algorithm is clear --- to process all
+			 * the queues in order. We could make it more directly,
+			 * requeueing packets from backlog to prequeue, if
+			 * is not empty. It is more elegant, but eats cycles,
+			 * unfortunately.
+			 */
+			if (skb_queue_len(&tp->ucopy.prequeue))
+				goto do_prequeue;
+
+			/* __ Set realtime policy in scheduler __ */
+		}
+
+		if (copied >= target) {
+			/* Do not sleep, just process backlog. */
+			release_sock(sk);
+			lock_sock(sk);
+		} else
+			sk_wait_data(sk, &timeo);
+
+		if (user_recv) {
+			int chunk;
+
+			/* __ Restore normal policy in scheduler __ */
+
+			if ((chunk = len - tp->ucopy.len) != 0) {
+				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
+				len -= chunk;
+				copied += chunk;
+			}
+
+			if (tp->rcv_nxt == tp->copied_seq &&
+			    skb_queue_len(&tp->ucopy.prequeue)) {
+do_prequeue:
+				tcp_prequeue_process(sk);
+
+				if ((chunk = len - tp->ucopy.len) != 0) {
+					NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+					len -= chunk;
+					copied += chunk;
+				}
+			}
+		}
+		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
+			if (net_ratelimit())
+				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
+				       current->comm, current->pid);
+			peek_seq = tp->copied_seq;
+		}
+		continue;
+
+	found_ok_skb:
+		/* Ok so how much can we use? */
+		used = skb->len - offset;
+		if (len < used)
+			used = len;
+
+		/* Do we have urgent data here? */
+		if (tp->urg_data) {
+			u32 urg_offset = tp->urg_seq - *seq;
+			if (urg_offset < used) {
+				if (!urg_offset) {
+					if (!sock_flag(sk, SOCK_URGINLINE)) {
+						++*seq;
+						offset++;
+						used--;
+						if (!used)
+							goto skip_copy;
+					}
+				} else
+					used = urg_offset;
+			}
+		}
+
+		if (!(flags & MSG_TRUNC)) {
+			err = skb_copy_datagram_iovec(skb, offset,
+						      msg->msg_iov, used);
+			if (err) {
+				/* Exception. Bailout! */
+				if (!copied)
+					copied = -EFAULT;
+				break;
+			}
+		}
+
+		*seq += used;
+		copied += used;
+		len -= used;
+
+		tcp_rcv_space_adjust(sk);
+
+skip_copy:
+		if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) {
+			tp->urg_data = 0;
+			tcp_fast_path_check(sk, tp);
+		}
+		if (used + offset < skb->len)
+			continue;
+
+		if (skb->h.th->fin)
+			goto found_fin_ok;
+		if (!(flags & MSG_PEEK))
+			sk_eat_skb(sk, skb);
+		continue;
+
+	found_fin_ok:
+		/* Process the FIN. */
+		++*seq;
+		if (!(flags & MSG_PEEK))
+			sk_eat_skb(sk, skb);
+		break;
+	} while (len > 0);
+
+	if (user_recv) {
+		if (skb_queue_len(&tp->ucopy.prequeue)) {
+			int chunk;
+
+			tp->ucopy.len = copied > 0 ? len : 0;
+
+			tcp_prequeue_process(sk);
+
+			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
+				NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
+				len -= chunk;
+				copied += chunk;
+			}
+		}
+
+		tp->ucopy.task = NULL;
+		tp->ucopy.len = 0;
+	}
+
+	/* According to UNIX98, msg_name/msg_namelen are ignored
+	 * on connected socket. I was just happy when found this 8) --ANK
+	 */
+
+	/* Clean up data we have read: This will do ACK frames. */
+	cleanup_rbuf(sk, copied);
+
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return copied;
+
+out:
+	TCP_CHECK_TIMER(sk);
+	release_sock(sk);
+	return err;
+
+recv_urg:
+	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
+	goto out;
+}
+
+/*
+ *	State processing on a close. This implements the state shift for
+ *	sending our FIN frame. Note that we only send a FIN for some
+ *	states. A shutdown() may have already sent the FIN, or we may be
+ *	closed.
+ */
+
+static unsigned char new_state[16] = {
+  /* current state:        new state:      action:	*/
+  /* (Invalid)		*/ TCP_CLOSE,
+  /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+  /* TCP_SYN_SENT	*/ TCP_CLOSE,
+  /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
+  /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
+  /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
+  /* TCP_TIME_WAIT	*/ TCP_CLOSE,
+  /* TCP_CLOSE		*/ TCP_CLOSE,
+  /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
+  /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
+  /* TCP_LISTEN		*/ TCP_CLOSE,
+  /* TCP_CLOSING	*/ TCP_CLOSING,
+};
+
+static int tcp_close_state(struct sock *sk)
+{
+	int next = (int)new_state[sk->sk_state];
+	int ns = next & TCP_STATE_MASK;
+
+	tcp_set_state(sk, ns);
+
+	return next & TCP_ACTION_FIN;
+}
+
+/*
+ *	Shutdown the sending side of a connection. Much like close except
+ *	that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD).
+ */
+
+void tcp_shutdown(struct sock *sk, int how)
+{
+	/*	We need to grab some memory, and put together a FIN,
+	 *	and then put it into the queue to be sent.
+	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
+	 */
+	if (!(how & SEND_SHUTDOWN))
+		return;
+
+	/* If we've already sent a FIN, or it's a closed state, skip this. */
+	if ((1 << sk->sk_state) &
+	    (TCPF_ESTABLISHED | TCPF_SYN_SENT |
+	     TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) {
+		/* Clear out any half completed packets.  FIN if needed. */
+		if (tcp_close_state(sk))
+			tcp_send_fin(sk);
+	}
+}
+
+/*
+ * At this point, there should be no process reference to this
+ * socket, and thus no user references at all.  Therefore we
+ * can assume the socket waitqueue is inactive and nobody will
+ * try to jump onto it.
+ */
+void tcp_destroy_sock(struct sock *sk)
+{
+	BUG_TRAP(sk->sk_state == TCP_CLOSE);
+	BUG_TRAP(sock_flag(sk, SOCK_DEAD));
+
+	/* It cannot be in hash table! */
+	BUG_TRAP(sk_unhashed(sk));
+
+	/* If it has not 0 inet_sk(sk)->num, it must be bound */
+	BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash);
+
+	sk->sk_prot->destroy(sk);
+
+	sk_stream_kill_queues(sk);
+
+	xfrm_sk_free_policy(sk);
+
+#ifdef INET_REFCNT_DEBUG
+	if (atomic_read(&sk->sk_refcnt) != 1) {
+		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n",
+		       sk, atomic_read(&sk->sk_refcnt));
+	}
+#endif
+
+	atomic_dec(&tcp_orphan_count);
+	sock_put(sk);
+}
+
+void tcp_close(struct sock *sk, long timeout)
+{
+	struct sk_buff *skb;
+	int data_was_unread = 0;
+
+	lock_sock(sk);
+	sk->sk_shutdown = SHUTDOWN_MASK;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		tcp_set_state(sk, TCP_CLOSE);
+
+		/* Special case. */
+		tcp_listen_stop(sk);
+
+		goto adjudge_to_death;
+	}
+
+	/*  We need to flush the recv. buffs.  We do this only on the
+	 *  descriptor close, not protocol-sourced closes, because the
+	 *  reader process may not have drained the data yet!
+	 */
+	while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq -
+			  skb->h.th->fin;
+		data_was_unread += len;
+		__kfree_skb(skb);
+	}
+
+	sk_stream_mem_reclaim(sk);
+
+	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
+	 * 3.10, we send a RST here because data was lost.  To
+	 * witness the awful effects of the old behavior of always
+	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
+	 * a bulk GET in an FTP client, suspend the process, wait
+	 * for the client to advertise a zero window, then kill -9
+	 * the FTP client, wheee...  Note: timeout is always zero
+	 * in such a case.
+	 */
+	if (data_was_unread) {
+		/* Unread data was tossed, zap the connection. */
+		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE);
+		tcp_set_state(sk, TCP_CLOSE);
+		tcp_send_active_reset(sk, GFP_KERNEL);
+	} else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+		/* Check zero linger _after_ checking for unread data. */
+		sk->sk_prot->disconnect(sk, 0);
+		NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA);
+	} else if (tcp_close_state(sk)) {
+		/* We FIN if the application ate all the data before
+		 * zapping the connection.
+		 */
+
+		/* RED-PEN. Formally speaking, we have broken TCP state
+		 * machine. State transitions:
+		 *
+		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
+		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
+		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
+		 *
+		 * are legal only when FIN has been sent (i.e. in window),
+		 * rather than queued out of window. Purists blame.
+		 *
+		 * F.e. "RFC state" is ESTABLISHED,
+		 * if Linux state is FIN-WAIT-1, but FIN is still not sent.
+		 *
+		 * The visible declinations are that sometimes
+		 * we enter time-wait state, when it is not required really
+		 * (harmless), do not send active resets, when they are
+		 * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when
+		 * they look as CLOSING or LAST_ACK for Linux)
+		 * Probably, I missed some more holelets.
+		 * 						--ANK
+		 */
+		tcp_send_fin(sk);
+	}
+
+	sk_stream_wait_close(sk, timeout);
+
+adjudge_to_death:
+	/* It is the last release_sock in its life. It will remove backlog. */
+	release_sock(sk);
+
+
+	/* Now socket is owned by kernel and we acquire BH lock
+	   to finish close. No need to check for user refs.
+	 */
+	local_bh_disable();
+	bh_lock_sock(sk);
+	BUG_TRAP(!sock_owned_by_user(sk));
+
+	sock_hold(sk);
+	sock_orphan(sk);
+
+	/*	This is a (useful) BSD violating of the RFC. There is a
+	 *	problem with TCP as specified in that the other end could
+	 *	keep a socket open forever with no application left this end.
+	 *	We use a 3 minute timeout (about the same as BSD) then kill
+	 *	our end. If they send after that then tough - BUT: long enough
+	 *	that we won't make the old 4*rto = almost no time - whoops
+	 *	reset mistake.
+	 *
+	 *	Nope, it was not mistake. It is really desired behaviour
+	 *	f.e. on http servers, when such sockets are useless, but
+	 *	consume significant resources. Let's do it with special
+	 *	linger2	option.					--ANK
+	 */
+
+	if (sk->sk_state == TCP_FIN_WAIT2) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		if (tp->linger2 < 0) {
+			tcp_set_state(sk, TCP_CLOSE);
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER);
+		} else {
+			int tmo = tcp_fin_time(tp);
+
+			if (tmo > TCP_TIMEWAIT_LEN) {
+				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
+			} else {
+				atomic_inc(&tcp_orphan_count);
+				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+				goto out;
+			}
+		}
+	}
+	if (sk->sk_state != TCP_CLOSE) {
+		sk_stream_mem_reclaim(sk);
+		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
+		    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
+			if (net_ratelimit())
+				printk(KERN_INFO "TCP: too many of orphaned "
+				       "sockets\n");
+			tcp_set_state(sk, TCP_CLOSE);
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
+		}
+	}
+	atomic_inc(&tcp_orphan_count);
+
+	if (sk->sk_state == TCP_CLOSE)
+		tcp_destroy_sock(sk);
+	/* Otherwise, socket is reprieved until protocol close. */
+
+out:
+	bh_unlock_sock(sk);
+	local_bh_enable();
+	sock_put(sk);
+}
+
+/* These states need RST on ABORT according to RFC793 */
+
+static inline int tcp_need_reset(int state)
+{
+	return (1 << state) &
+	       (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 |
+		TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
+}
+
+int tcp_disconnect(struct sock *sk, int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err = 0;
+	int old_state = sk->sk_state;
+
+	if (old_state != TCP_CLOSE)
+		tcp_set_state(sk, TCP_CLOSE);
+
+	/* ABORT function of RFC793 */
+	if (old_state == TCP_LISTEN) {
+		tcp_listen_stop(sk);
+	} else if (tcp_need_reset(old_state) ||
+		   (tp->snd_nxt != tp->write_seq &&
+		    (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) {
+		/* The last check adjusts for discrepance of Linux wrt. RFC
+		 * states
+		 */
+		tcp_send_active_reset(sk, gfp_any());
+		sk->sk_err = ECONNRESET;
+	} else if (old_state == TCP_SYN_SENT)
+		sk->sk_err = ECONNRESET;
+
+	tcp_clear_xmit_timers(sk);
+	__skb_queue_purge(&sk->sk_receive_queue);
+	sk_stream_writequeue_purge(sk);
+	__skb_queue_purge(&tp->out_of_order_queue);
+
+	inet->dport = 0;
+
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+		inet_reset_saddr(sk);
+
+	sk->sk_shutdown = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+	tp->srtt = 0;
+	if ((tp->write_seq += tp->max_window + 2) == 0)
+		tp->write_seq = 1;
+	tp->backoff = 0;
+	tp->snd_cwnd = 2;
+	tp->probes_out = 0;
+	tp->packets_out = 0;
+	tp->snd_ssthresh = 0x7fffffff;
+	tp->snd_cwnd_cnt = 0;
+	tcp_set_ca_state(tp, TCP_CA_Open);
+	tcp_clear_retrans(tp);
+	tcp_delack_init(tp);
+	sk->sk_send_head = NULL;
+	tp->rx_opt.saw_tstamp = 0;
+	tcp_sack_reset(&tp->rx_opt);
+	__sk_dst_reset(sk);
+
+	BUG_TRAP(!inet->num || tp->bind_hash);
+
+	sk->sk_error_report(sk);
+	return err;
+}
+
+/*
+ *	Wait for an incoming connection, avoid race
+ *	conditions. This must be called with the socket locked.
+ */
+static int wait_for_connect(struct sock *sk, long timeo)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	DEFINE_WAIT(wait);
+	int err;
+
+	/*
+	 * True wake-one mechanism for incoming connections: only
+	 * one process gets woken up, not the 'whole herd'.
+	 * Since we do not 'race & poll' for established sockets
+	 * anymore, the common case will execute the loop only once.
+	 *
+	 * Subtle issue: "add_wait_queue_exclusive()" will be added
+	 * after any current non-exclusive waiters, and we know that
+	 * it will always _stay_ after any new non-exclusive waiters
+	 * because all non-exclusive waiters are added at the
+	 * beginning of the wait-queue. As such, it's ok to "drop"
+	 * our exclusiveness temporarily when we get woken up without
+	 * having to remove and re-insert us on the wait queue.
+	 */
+	for (;;) {
+		prepare_to_wait_exclusive(sk->sk_sleep, &wait,
+					  TASK_INTERRUPTIBLE);
+		release_sock(sk);
+		if (!tp->accept_queue)
+			timeo = schedule_timeout(timeo);
+		lock_sock(sk);
+		err = 0;
+		if (tp->accept_queue)
+			break;
+		err = -EINVAL;
+		if (sk->sk_state != TCP_LISTEN)
+			break;
+		err = sock_intr_errno(timeo);
+		if (signal_pending(current))
+			break;
+		err = -EAGAIN;
+		if (!timeo)
+			break;
+	}
+	finish_wait(sk->sk_sleep, &wait);
+	return err;
+}
+
+/*
+ *	This will accept the next outstanding connection.
+ */
+
+struct sock *tcp_accept(struct sock *sk, int flags, int *err)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct open_request *req;
+	struct sock *newsk;
+	int error;
+
+	lock_sock(sk);
+
+	/* We need to make sure that this socket is listening,
+	 * and that it has something pending.
+	 */
+	error = -EINVAL;
+	if (sk->sk_state != TCP_LISTEN)
+		goto out;
+
+	/* Find already established connection */
+	if (!tp->accept_queue) {
+		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
+
+		/* If this is a non blocking socket don't sleep */
+		error = -EAGAIN;
+		if (!timeo)
+			goto out;
+
+		error = wait_for_connect(sk, timeo);
+		if (error)
+			goto out;
+	}
+
+	req = tp->accept_queue;
+	if ((tp->accept_queue = req->dl_next) == NULL)
+		tp->accept_queue_tail = NULL;
+
+ 	newsk = req->sk;
+	sk_acceptq_removed(sk);
+	tcp_openreq_fastfree(req);
+	BUG_TRAP(newsk->sk_state != TCP_SYN_RECV);
+	release_sock(sk);
+	return newsk;
+
+out:
+	release_sock(sk);
+	*err = error;
+	return NULL;
+}
+
+/*
+ *	Socket option code for TCP.
+ */
+int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval,
+		   int optlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int val;
+	int err = 0;
+
+	if (level != SOL_TCP)
+		return tp->af_specific->setsockopt(sk, level, optname,
+						   optval, optlen);
+
+	if (optlen < sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	lock_sock(sk);
+
+	switch (optname) {
+	case TCP_MAXSEG:
+		/* Values greater than interface MTU won't take effect. However
+		 * at the point when this call is done we typically don't yet
+		 * know which interface is going to be used */
+		if (val < 8 || val > MAX_TCP_WINDOW) {
+			err = -EINVAL;
+			break;
+		}
+		tp->rx_opt.user_mss = val;
+		break;
+
+	case TCP_NODELAY:
+		if (val) {
+			/* TCP_NODELAY is weaker than TCP_CORK, so that
+			 * this option on corked socket is remembered, but
+			 * it is not activated until cork is cleared.
+			 *
+			 * However, when TCP_NODELAY is set we make
+			 * an explicit push, which overrides even TCP_CORK
+			 * for currently queued segments.
+			 */
+			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
+			tcp_push_pending_frames(sk, tp);
+		} else {
+			tp->nonagle &= ~TCP_NAGLE_OFF;
+		}
+		break;
+
+	case TCP_CORK:
+		/* When set indicates to always queue non-full frames.
+		 * Later the user clears this option and we transmit
+		 * any pending partial frames in the queue.  This is
+		 * meant to be used alongside sendfile() to get properly
+		 * filled frames when the user (for example) must write
+		 * out headers with a write() call first and then use
+		 * sendfile to send out the data parts.
+		 *
+		 * TCP_CORK can be set together with TCP_NODELAY and it is
+		 * stronger than TCP_NODELAY.
+		 */
+		if (val) {
+			tp->nonagle |= TCP_NAGLE_CORK;
+		} else {
+			tp->nonagle &= ~TCP_NAGLE_CORK;
+			if (tp->nonagle&TCP_NAGLE_OFF)
+				tp->nonagle |= TCP_NAGLE_PUSH;
+			tcp_push_pending_frames(sk, tp);
+		}
+		break;
+
+	case TCP_KEEPIDLE:
+		if (val < 1 || val > MAX_TCP_KEEPIDLE)
+			err = -EINVAL;
+		else {
+			tp->keepalive_time = val * HZ;
+			if (sock_flag(sk, SOCK_KEEPOPEN) &&
+			    !((1 << sk->sk_state) &
+			      (TCPF_CLOSE | TCPF_LISTEN))) {
+				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
+				if (tp->keepalive_time > elapsed)
+					elapsed = tp->keepalive_time - elapsed;
+				else
+					elapsed = 0;
+				tcp_reset_keepalive_timer(sk, elapsed);
+			}
+		}
+		break;
+	case TCP_KEEPINTVL:
+		if (val < 1 || val > MAX_TCP_KEEPINTVL)
+			err = -EINVAL;
+		else
+			tp->keepalive_intvl = val * HZ;
+		break;
+	case TCP_KEEPCNT:
+		if (val < 1 || val > MAX_TCP_KEEPCNT)
+			err = -EINVAL;
+		else
+			tp->keepalive_probes = val;
+		break;
+	case TCP_SYNCNT:
+		if (val < 1 || val > MAX_TCP_SYNCNT)
+			err = -EINVAL;
+		else
+			tp->syn_retries = val;
+		break;
+
+	case TCP_LINGER2:
+		if (val < 0)
+			tp->linger2 = -1;
+		else if (val > sysctl_tcp_fin_timeout / HZ)
+			tp->linger2 = 0;
+		else
+			tp->linger2 = val * HZ;
+		break;
+
+	case TCP_DEFER_ACCEPT:
+		tp->defer_accept = 0;
+		if (val > 0) {
+			/* Translate value in seconds to number of
+			 * retransmits */
+			while (tp->defer_accept < 32 &&
+			       val > ((TCP_TIMEOUT_INIT / HZ) <<
+				       tp->defer_accept))
+				tp->defer_accept++;
+			tp->defer_accept++;
+		}
+		break;
+
+	case TCP_WINDOW_CLAMP:
+		if (!val) {
+			if (sk->sk_state != TCP_CLOSE) {
+				err = -EINVAL;
+				break;
+			}
+			tp->window_clamp = 0;
+		} else
+			tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ?
+						SOCK_MIN_RCVBUF / 2 : val;
+		break;
+
+	case TCP_QUICKACK:
+		if (!val) {
+			tp->ack.pingpong = 1;
+		} else {
+			tp->ack.pingpong = 0;
+			if ((1 << sk->sk_state) &
+			    (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
+			    tcp_ack_scheduled(tp)) {
+				tp->ack.pending |= TCP_ACK_PUSHED;
+				cleanup_rbuf(sk, 1);
+				if (!(val & 1))
+					tp->ack.pingpong = 1;
+			}
+		}
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	};
+	release_sock(sk);
+	return err;
+}
+
+/* Return information about state of tcp endpoint in API format. */
+void tcp_get_info(struct sock *sk, struct tcp_info *info)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 now = tcp_time_stamp;
+
+	memset(info, 0, sizeof(*info));
+
+	info->tcpi_state = sk->sk_state;
+	info->tcpi_ca_state = tp->ca_state;
+	info->tcpi_retransmits = tp->retransmits;
+	info->tcpi_probes = tp->probes_out;
+	info->tcpi_backoff = tp->backoff;
+
+	if (tp->rx_opt.tstamp_ok)
+		info->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+	if (tp->rx_opt.sack_ok)
+		info->tcpi_options |= TCPI_OPT_SACK;
+	if (tp->rx_opt.wscale_ok) {
+		info->tcpi_options |= TCPI_OPT_WSCALE;
+		info->tcpi_snd_wscale = tp->rx_opt.snd_wscale;
+		info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale;
+	} 
+
+	if (tp->ecn_flags&TCP_ECN_OK)
+		info->tcpi_options |= TCPI_OPT_ECN;
+
+	info->tcpi_rto = jiffies_to_usecs(tp->rto);
+	info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
+	info->tcpi_snd_mss = tp->mss_cache_std;
+	info->tcpi_rcv_mss = tp->ack.rcv_mss;
+
+	info->tcpi_unacked = tp->packets_out;
+	info->tcpi_sacked = tp->sacked_out;
+	info->tcpi_lost = tp->lost_out;
+	info->tcpi_retrans = tp->retrans_out;
+	info->tcpi_fackets = tp->fackets_out;
+
+	info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime);
+	info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime);
+	info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp);
+
+	info->tcpi_pmtu = tp->pmtu_cookie;
+	info->tcpi_rcv_ssthresh = tp->rcv_ssthresh;
+	info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3;
+	info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2;
+	info->tcpi_snd_ssthresh = tp->snd_ssthresh;
+	info->tcpi_snd_cwnd = tp->snd_cwnd;
+	info->tcpi_advmss = tp->advmss;
+	info->tcpi_reordering = tp->reordering;
+
+	info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3;
+	info->tcpi_rcv_space = tp->rcvq_space.space;
+
+	info->tcpi_total_retrans = tp->total_retrans;
+}
+
+EXPORT_SYMBOL_GPL(tcp_get_info);
+
+int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,
+		   int __user *optlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int val, len;
+
+	if (level != SOL_TCP)
+		return tp->af_specific->getsockopt(sk, level, optname,
+						   optval, optlen);
+
+	if (get_user(len, optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+
+	if (len < 0)
+		return -EINVAL;
+
+	switch (optname) {
+	case TCP_MAXSEG:
+		val = tp->mss_cache_std;
+		if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
+			val = tp->rx_opt.user_mss;
+		break;
+	case TCP_NODELAY:
+		val = !!(tp->nonagle&TCP_NAGLE_OFF);
+		break;
+	case TCP_CORK:
+		val = !!(tp->nonagle&TCP_NAGLE_CORK);
+		break;
+	case TCP_KEEPIDLE:
+		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ;
+		break;
+	case TCP_KEEPINTVL:
+		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ;
+		break;
+	case TCP_KEEPCNT:
+		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
+		break;
+	case TCP_SYNCNT:
+		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
+		break;
+	case TCP_LINGER2:
+		val = tp->linger2;
+		if (val >= 0)
+			val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+		break;
+	case TCP_DEFER_ACCEPT:
+		val = !tp->defer_accept ? 0 : ((TCP_TIMEOUT_INIT / HZ) <<
+					       (tp->defer_accept - 1));
+		break;
+	case TCP_WINDOW_CLAMP:
+		val = tp->window_clamp;
+		break;
+	case TCP_INFO: {
+		struct tcp_info info;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		tcp_get_info(sk, &info);
+
+		len = min_t(unsigned int, len, sizeof(info));
+		if (put_user(len, optlen))
+			return -EFAULT;
+		if (copy_to_user(optval, &info, len))
+			return -EFAULT;
+		return 0;
+	}
+	case TCP_QUICKACK:
+		val = !tp->ack.pingpong;
+		break;
+	default:
+		return -ENOPROTOOPT;
+	};
+
+	if (put_user(len, optlen))
+		return -EFAULT;
+	if (copy_to_user(optval, &val, len))
+		return -EFAULT;
+	return 0;
+}
+
+
+extern void __skb_cb_too_small_for_tcp(int, int);
+extern void tcpdiag_init(void);
+
+static __initdata unsigned long thash_entries;
+static int __init set_thash_entries(char *str)
+{
+	if (!str)
+		return 0;
+	thash_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("thash_entries=", set_thash_entries);
+
+void __init tcp_init(void)
+{
+	struct sk_buff *skb = NULL;
+	int order, i;
+
+	if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
+		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
+					   sizeof(skb->cb));
+
+	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
+						   sizeof(struct open_request),
+					       0, SLAB_HWCACHE_ALIGN,
+					       NULL, NULL);
+	if (!tcp_openreq_cachep)
+		panic("tcp_init: Cannot alloc open_request cache.");
+
+	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
+					      sizeof(struct tcp_bind_bucket),
+					      0, SLAB_HWCACHE_ALIGN,
+					      NULL, NULL);
+	if (!tcp_bucket_cachep)
+		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
+
+	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
+						sizeof(struct tcp_tw_bucket),
+						0, SLAB_HWCACHE_ALIGN,
+						NULL, NULL);
+	if (!tcp_timewait_cachep)
+		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
+
+	/* Size and allocate the main established and bind bucket
+	 * hash tables.
+	 *
+	 * The methodology is similar to that of the buffer cache.
+	 */
+	tcp_ehash = (struct tcp_ehash_bucket *)
+		alloc_large_system_hash("TCP established",
+					sizeof(struct tcp_ehash_bucket),
+					thash_entries,
+					(num_physpages >= 128 * 1024) ?
+						(25 - PAGE_SHIFT) :
+						(27 - PAGE_SHIFT),
+					HASH_HIGHMEM,
+					&tcp_ehash_size,
+					NULL,
+					0);
+	tcp_ehash_size = (1 << tcp_ehash_size) >> 1;
+	for (i = 0; i < (tcp_ehash_size << 1); i++) {
+		rwlock_init(&tcp_ehash[i].lock);
+		INIT_HLIST_HEAD(&tcp_ehash[i].chain);
+	}
+
+	tcp_bhash = (struct tcp_bind_hashbucket *)
+		alloc_large_system_hash("TCP bind",
+					sizeof(struct tcp_bind_hashbucket),
+					tcp_ehash_size,
+					(num_physpages >= 128 * 1024) ?
+						(25 - PAGE_SHIFT) :
+						(27 - PAGE_SHIFT),
+					HASH_HIGHMEM,
+					&tcp_bhash_size,
+					NULL,
+					64 * 1024);
+	tcp_bhash_size = 1 << tcp_bhash_size;
+	for (i = 0; i < tcp_bhash_size; i++) {
+		spin_lock_init(&tcp_bhash[i].lock);
+		INIT_HLIST_HEAD(&tcp_bhash[i].chain);
+	}
+
+	/* Try to be a bit smarter and adjust defaults depending
+	 * on available memory.
+	 */
+	for (order = 0; ((1 << order) << PAGE_SHIFT) <
+			(tcp_bhash_size * sizeof(struct tcp_bind_hashbucket));
+			order++)
+		;
+	if (order > 4) {
+		sysctl_local_port_range[0] = 32768;
+		sysctl_local_port_range[1] = 61000;
+		sysctl_tcp_max_tw_buckets = 180000;
+		sysctl_tcp_max_orphans = 4096 << (order - 4);
+		sysctl_max_syn_backlog = 1024;
+	} else if (order < 3) {
+		sysctl_local_port_range[0] = 1024 * (3 - order);
+		sysctl_tcp_max_tw_buckets >>= (3 - order);
+		sysctl_tcp_max_orphans >>= (3 - order);
+		sysctl_max_syn_backlog = 128;
+	}
+	tcp_port_rover = sysctl_local_port_range[0] - 1;
+
+	sysctl_tcp_mem[0] =  768 << order;
+	sysctl_tcp_mem[1] = 1024 << order;
+	sysctl_tcp_mem[2] = 1536 << order;
+
+	if (order < 3) {
+		sysctl_tcp_wmem[2] = 64 * 1024;
+		sysctl_tcp_rmem[0] = PAGE_SIZE;
+		sysctl_tcp_rmem[1] = 43689;
+		sysctl_tcp_rmem[2] = 2 * 43689;
+	}
+
+	printk(KERN_INFO "TCP: Hash tables configured "
+	       "(established %d bind %d)\n",
+	       tcp_ehash_size << 1, tcp_bhash_size);
+}
+
+EXPORT_SYMBOL(tcp_accept);
+EXPORT_SYMBOL(tcp_close);
+EXPORT_SYMBOL(tcp_destroy_sock);
+EXPORT_SYMBOL(tcp_disconnect);
+EXPORT_SYMBOL(tcp_getsockopt);
+EXPORT_SYMBOL(tcp_ioctl);
+EXPORT_SYMBOL(tcp_openreq_cachep);
+EXPORT_SYMBOL(tcp_poll);
+EXPORT_SYMBOL(tcp_read_sock);
+EXPORT_SYMBOL(tcp_recvmsg);
+EXPORT_SYMBOL(tcp_sendmsg);
+EXPORT_SYMBOL(tcp_sendpage);
+EXPORT_SYMBOL(tcp_setsockopt);
+EXPORT_SYMBOL(tcp_shutdown);
+EXPORT_SYMBOL(tcp_statistics);
+EXPORT_SYMBOL(tcp_timewait_cachep);
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
new file mode 100644
index 000000000000..313c1408da33
--- /dev/null
+++ b/net/ipv4/tcp_diag.c
@@ -0,0 +1,802 @@
+/*
+ * tcp_diag.c	Module for monitoring TCP sockets.
+ *
+ * Version:	$Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/time.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+
+#include <linux/inet.h>
+#include <linux/stddef.h>
+
+#include <linux/tcp_diag.h>
+
+struct tcpdiag_entry
+{
+	u32 *saddr;
+	u32 *daddr;
+	u16 sport;
+	u16 dport;
+	u16 family;
+	u16 userlocks;
+};
+
+static struct sock *tcpnl;
+
+
+#define TCPDIAG_PUT(skb, attrtype, attrlen) \
+({ int rtalen = RTA_LENGTH(attrlen);        \
+   struct rtattr *rta;                      \
+   if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \
+   rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \
+   rta->rta_type = attrtype;                \
+   rta->rta_len = rtalen;                   \
+   RTA_DATA(rta); })
+
+static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk,
+			int ext, u32 pid, u32 seq, u16 nlmsg_flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcpdiagmsg *r;
+	struct nlmsghdr  *nlh;
+	struct tcp_info  *info = NULL;
+	struct tcpdiag_meminfo  *minfo = NULL;
+	struct tcpvegas_info *vinfo = NULL;
+	unsigned char	 *b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
+	nlh->nlmsg_flags = nlmsg_flags;
+	r = NLMSG_DATA(nlh);
+	if (sk->sk_state != TCP_TIME_WAIT) {
+		if (ext & (1<<(TCPDIAG_MEMINFO-1)))
+			minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo));
+		if (ext & (1<<(TCPDIAG_INFO-1)))
+			info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info));
+		
+		if ((tcp_is_westwood(tp) || tcp_is_vegas(tp))
+		    && (ext & (1<<(TCPDIAG_VEGASINFO-1))))
+			vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo));
+	}
+	r->tcpdiag_family = sk->sk_family;
+	r->tcpdiag_state = sk->sk_state;
+	r->tcpdiag_timer = 0;
+	r->tcpdiag_retrans = 0;
+
+	r->id.tcpdiag_if = sk->sk_bound_dev_if;
+	r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk;
+	r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+
+	if (r->tcpdiag_state == TCP_TIME_WAIT) {
+		struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk;
+		long tmo = tw->tw_ttd - jiffies;
+		if (tmo < 0)
+			tmo = 0;
+
+		r->id.tcpdiag_sport = tw->tw_sport;
+		r->id.tcpdiag_dport = tw->tw_dport;
+		r->id.tcpdiag_src[0] = tw->tw_rcv_saddr;
+		r->id.tcpdiag_dst[0] = tw->tw_daddr;
+		r->tcpdiag_state = tw->tw_substate;
+		r->tcpdiag_timer = 3;
+		r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ;
+		r->tcpdiag_rqueue = 0;
+		r->tcpdiag_wqueue = 0;
+		r->tcpdiag_uid = 0;
+		r->tcpdiag_inode = 0;
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+		if (r->tcpdiag_family == AF_INET6) {
+			ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
+				       &tw->tw_v6_rcv_saddr);
+			ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
+				       &tw->tw_v6_daddr);
+		}
+#endif
+		nlh->nlmsg_len = skb->tail - b;
+		return skb->len;
+	}
+
+	r->id.tcpdiag_sport = inet->sport;
+	r->id.tcpdiag_dport = inet->dport;
+	r->id.tcpdiag_src[0] = inet->rcv_saddr;
+	r->id.tcpdiag_dst[0] = inet->daddr;
+
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+	if (r->tcpdiag_family == AF_INET6) {
+		struct ipv6_pinfo *np = inet6_sk(sk);
+
+		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
+			       &np->rcv_saddr);
+		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
+			       &np->daddr);
+	}
+#endif
+
+#define EXPIRES_IN_MS(tmo)  ((tmo-jiffies)*1000+HZ-1)/HZ
+
+	if (tp->pending == TCP_TIME_RETRANS) {
+		r->tcpdiag_timer = 1;
+		r->tcpdiag_retrans = tp->retransmits;
+		r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
+	} else if (tp->pending == TCP_TIME_PROBE0) {
+		r->tcpdiag_timer = 4;
+		r->tcpdiag_retrans = tp->probes_out;
+		r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout);
+	} else if (timer_pending(&sk->sk_timer)) {
+		r->tcpdiag_timer = 2;
+		r->tcpdiag_retrans = tp->probes_out;
+		r->tcpdiag_expires = EXPIRES_IN_MS(sk->sk_timer.expires);
+	} else {
+		r->tcpdiag_timer = 0;
+		r->tcpdiag_expires = 0;
+	}
+#undef EXPIRES_IN_MS
+
+	r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+	r->tcpdiag_wqueue = tp->write_seq - tp->snd_una;
+	r->tcpdiag_uid = sock_i_uid(sk);
+	r->tcpdiag_inode = sock_i_ino(sk);
+
+	if (minfo) {
+		minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc);
+		minfo->tcpdiag_wmem = sk->sk_wmem_queued;
+		minfo->tcpdiag_fmem = sk->sk_forward_alloc;
+		minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc);
+	}
+
+	if (info) 
+		tcp_get_info(sk, info);
+
+	if (vinfo) {
+		if (tcp_is_vegas(tp)) {
+			vinfo->tcpv_enabled = tp->vegas.doing_vegas_now;
+			vinfo->tcpv_rttcnt = tp->vegas.cntRTT;
+			vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT);
+			vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT);
+		} else {
+			vinfo->tcpv_enabled = 0;
+			vinfo->tcpv_rttcnt = 0;
+			vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt);
+			vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min);
+		}
+	}
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport,
+				  int dif);
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
+				  struct in6_addr *daddr, u16 dport,
+				  int dif);
+#else
+static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
+					 struct in6_addr *daddr, u16 dport,
+					 int dif)
+{
+	return NULL;
+}
+#endif
+
+static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh)
+{
+	int err;
+	struct sock *sk;
+	struct tcpdiagreq *req = NLMSG_DATA(nlh);
+	struct sk_buff *rep;
+
+	if (req->tcpdiag_family == AF_INET) {
+		sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport,
+				   req->id.tcpdiag_src[0], req->id.tcpdiag_sport,
+				   req->id.tcpdiag_if);
+	}
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+	else if (req->tcpdiag_family == AF_INET6) {
+		sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport,
+				   (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport,
+				   req->id.tcpdiag_if);
+	}
+#endif
+	else {
+		return -EINVAL;
+	}
+
+	if (sk == NULL)
+		return -ENOENT;
+
+	err = -ESTALE;
+	if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE ||
+	     req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) &&
+	    ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] ||
+	     (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1]))
+		goto out;
+
+	err = -ENOMEM;
+	rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+
+				    sizeof(struct tcpdiag_meminfo)+
+				    sizeof(struct tcp_info)+64), GFP_KERNEL);
+	if (!rep)
+		goto out;
+
+	if (tcpdiag_fill(rep, sk, req->tcpdiag_ext,
+			 NETLINK_CB(in_skb).pid,
+			 nlh->nlmsg_seq, 0) <= 0)
+		BUG();
+
+	err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+
+out:
+	if (sk) {
+		if (sk->sk_state == TCP_TIME_WAIT)
+			tcp_tw_put((struct tcp_tw_bucket*)sk);
+		else
+			sock_put(sk);
+	}
+	return err;
+}
+
+static int bitstring_match(const u32 *a1, const u32 *a2, int bits)
+{
+	int words = bits >> 5;
+
+	bits &= 0x1f;
+
+	if (words) {
+		if (memcmp(a1, a2, words << 2))
+			return 0;
+	}
+	if (bits) {
+		__u32 w1, w2;
+		__u32 mask;
+
+		w1 = a1[words];
+		w2 = a2[words];
+
+		mask = htonl((0xffffffff) << (32 - bits));
+
+		if ((w1 ^ w2) & mask)
+			return 0;
+	}
+
+	return 1;
+}
+
+
+static int tcpdiag_bc_run(const void *bc, int len,
+			  const struct tcpdiag_entry *entry)
+{
+	while (len > 0) {
+		int yes = 1;
+		const struct tcpdiag_bc_op *op = bc;
+
+		switch (op->code) {
+		case TCPDIAG_BC_NOP:
+			break;
+		case TCPDIAG_BC_JMP:
+			yes = 0;
+			break;
+		case TCPDIAG_BC_S_GE:
+			yes = entry->sport >= op[1].no;
+			break;
+		case TCPDIAG_BC_S_LE:
+			yes = entry->dport <= op[1].no;
+			break;
+		case TCPDIAG_BC_D_GE:
+			yes = entry->dport >= op[1].no;
+			break;
+		case TCPDIAG_BC_D_LE:
+			yes = entry->dport <= op[1].no;
+			break;
+		case TCPDIAG_BC_AUTO:
+			yes = !(entry->userlocks & SOCK_BINDPORT_LOCK);
+			break;
+		case TCPDIAG_BC_S_COND:
+		case TCPDIAG_BC_D_COND:
+		{
+			struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1);
+			u32 *addr;
+
+			if (cond->port != -1 &&
+			    cond->port != (op->code == TCPDIAG_BC_S_COND ?
+					     entry->sport : entry->dport)) {
+				yes = 0;
+				break;
+			}
+			
+			if (cond->prefix_len == 0)
+				break;
+
+			if (op->code == TCPDIAG_BC_S_COND)
+				addr = entry->saddr;
+			else
+				addr = entry->daddr;
+
+			if (bitstring_match(addr, cond->addr, cond->prefix_len))
+				break;
+			if (entry->family == AF_INET6 &&
+			    cond->family == AF_INET) {
+				if (addr[0] == 0 && addr[1] == 0 &&
+				    addr[2] == htonl(0xffff) &&
+				    bitstring_match(addr+3, cond->addr, cond->prefix_len))
+					break;
+			}
+			yes = 0;
+			break;
+		}
+		}
+
+		if (yes) { 
+			len -= op->yes;
+			bc += op->yes;
+		} else {
+			len -= op->no;
+			bc += op->no;
+		}
+	}
+	return (len == 0);
+}
+
+static int valid_cc(const void *bc, int len, int cc)
+{
+	while (len >= 0) {
+		const struct tcpdiag_bc_op *op = bc;
+
+		if (cc > len)
+			return 0;
+		if (cc == len)
+			return 1;
+		if (op->yes < 4)
+			return 0;
+		len -= op->yes;
+		bc  += op->yes;
+	}
+	return 0;
+}
+
+static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len)
+{
+	const unsigned char *bc = bytecode;
+	int  len = bytecode_len;
+
+	while (len > 0) {
+		struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc;
+
+//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len);
+		switch (op->code) {
+		case TCPDIAG_BC_AUTO:
+		case TCPDIAG_BC_S_COND:
+		case TCPDIAG_BC_D_COND:
+		case TCPDIAG_BC_S_GE:
+		case TCPDIAG_BC_S_LE:
+		case TCPDIAG_BC_D_GE:
+		case TCPDIAG_BC_D_LE:
+			if (op->yes < 4 || op->yes > len+4)
+				return -EINVAL;
+		case TCPDIAG_BC_JMP:
+			if (op->no < 4 || op->no > len+4)
+				return -EINVAL;
+			if (op->no < len &&
+			    !valid_cc(bytecode, bytecode_len, len-op->no))
+				return -EINVAL;
+			break;
+		case TCPDIAG_BC_NOP:
+			if (op->yes < 4 || op->yes > len+4)
+				return -EINVAL;
+			break;
+		default:
+			return -EINVAL;
+		}
+		bc += op->yes;
+		len -= op->yes;
+	}
+	return len == 0 ? 0 : -EINVAL;
+}
+
+static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk,
+			     struct netlink_callback *cb)
+{
+	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
+
+	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+		struct tcpdiag_entry entry;
+		struct rtattr *bc = (struct rtattr *)(r + 1);
+		struct inet_sock *inet = inet_sk(sk);
+
+		entry.family = sk->sk_family;
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+		if (entry.family == AF_INET6) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+
+			entry.saddr = np->rcv_saddr.s6_addr32;
+			entry.daddr = np->daddr.s6_addr32;
+		} else
+#endif
+		{
+			entry.saddr = &inet->rcv_saddr;
+			entry.daddr = &inet->daddr;
+		}
+		entry.sport = inet->num;
+		entry.dport = ntohs(inet->dport);
+		entry.userlocks = sk->sk_userlocks;
+
+		if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
+			return 0;
+	}
+
+	return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid,
+			    cb->nlh->nlmsg_seq, NLM_F_MULTI);
+}
+
+static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk,
+			    struct open_request *req,
+			    u32 pid, u32 seq)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	unsigned char *b = skb->tail;
+	struct tcpdiagmsg *r;
+	struct nlmsghdr *nlh;
+	long tmo;
+
+	nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r));
+	nlh->nlmsg_flags = NLM_F_MULTI;
+	r = NLMSG_DATA(nlh);
+
+	r->tcpdiag_family = sk->sk_family;
+	r->tcpdiag_state = TCP_SYN_RECV;
+	r->tcpdiag_timer = 1;
+	r->tcpdiag_retrans = req->retrans;
+
+	r->id.tcpdiag_if = sk->sk_bound_dev_if;
+	r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req;
+	r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1);
+
+	tmo = req->expires - jiffies;
+	if (tmo < 0)
+		tmo = 0;
+
+	r->id.tcpdiag_sport = inet->sport;
+	r->id.tcpdiag_dport = req->rmt_port;
+	r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr;
+	r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr;
+	r->tcpdiag_expires = jiffies_to_msecs(tmo),
+	r->tcpdiag_rqueue = 0;
+	r->tcpdiag_wqueue = 0;
+	r->tcpdiag_uid = sock_i_uid(sk);
+	r->tcpdiag_inode = 0;
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+	if (r->tcpdiag_family == AF_INET6) {
+		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src,
+			       &req->af.v6_req.loc_addr);
+		ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst,
+			       &req->af.v6_req.rmt_addr);
+	}
+#endif
+	nlh->nlmsg_len = skb->tail - b;
+
+	return skb->len;
+
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk,
+			     struct netlink_callback *cb)
+{
+	struct tcpdiag_entry entry;
+	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_listen_opt *lopt;
+	struct rtattr *bc = NULL;
+	struct inet_sock *inet = inet_sk(sk);
+	int j, s_j;
+	int reqnum, s_reqnum;
+	int err = 0;
+
+	s_j = cb->args[3];
+	s_reqnum = cb->args[4];
+
+	if (s_j > 0)
+		s_j--;
+
+	entry.family = sk->sk_family;
+
+	read_lock_bh(&tp->syn_wait_lock);
+
+	lopt = tp->listen_opt;
+	if (!lopt || !lopt->qlen)
+		goto out;
+
+	if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
+		bc = (struct rtattr *)(r + 1);
+		entry.sport = inet->num;
+		entry.userlocks = sk->sk_userlocks;
+	}
+
+	for (j = s_j; j < TCP_SYNQ_HSIZE; j++) {
+		struct open_request *req, *head = lopt->syn_table[j];
+
+		reqnum = 0;
+		for (req = head; req; reqnum++, req = req->dl_next) {
+			if (reqnum < s_reqnum)
+				continue;
+			if (r->id.tcpdiag_dport != req->rmt_port &&
+			    r->id.tcpdiag_dport)
+				continue;
+
+			if (bc) {
+				entry.saddr =
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+					(entry.family == AF_INET6) ?
+					req->af.v6_req.loc_addr.s6_addr32 :
+#endif
+					&req->af.v4_req.loc_addr;
+				entry.daddr = 
+#ifdef CONFIG_IP_TCPDIAG_IPV6
+					(entry.family == AF_INET6) ?
+					req->af.v6_req.rmt_addr.s6_addr32 :
+#endif
+					&req->af.v4_req.rmt_addr;
+				entry.dport = ntohs(req->rmt_port);
+
+				if (!tcpdiag_bc_run(RTA_DATA(bc),
+						    RTA_PAYLOAD(bc), &entry))
+					continue;
+			}
+
+			err = tcpdiag_fill_req(skb, sk, req,
+					       NETLINK_CB(cb->skb).pid,
+					       cb->nlh->nlmsg_seq);
+			if (err < 0) {
+				cb->args[3] = j + 1;
+				cb->args[4] = reqnum;
+				goto out;
+			}
+		}
+
+		s_reqnum = 0;
+	}
+
+out:
+	read_unlock_bh(&tp->syn_wait_lock);
+
+	return err;
+}
+
+static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int i, num;
+	int s_i, s_num;
+	struct tcpdiagreq *r = NLMSG_DATA(cb->nlh);
+
+	s_i = cb->args[1];
+	s_num = num = cb->args[2];
+
+	if (cb->args[0] == 0) {
+		if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV)))
+			goto skip_listen_ht;
+		tcp_listen_lock();
+		for (i = s_i; i < TCP_LHTABLE_SIZE; i++) {
+			struct sock *sk;
+			struct hlist_node *node;
+
+			num = 0;
+			sk_for_each(sk, node, &tcp_listening_hash[i]) {
+				struct inet_sock *inet = inet_sk(sk);
+
+				if (num < s_num) {
+					num++;
+					continue;
+				}
+
+				if (r->id.tcpdiag_sport != inet->sport &&
+				    r->id.tcpdiag_sport)
+					goto next_listen;
+
+				if (!(r->tcpdiag_states&TCPF_LISTEN) ||
+				    r->id.tcpdiag_dport ||
+				    cb->args[3] > 0)
+					goto syn_recv;
+
+				if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
+					tcp_listen_unlock();
+					goto done;
+				}
+
+syn_recv:
+				if (!(r->tcpdiag_states&TCPF_SYN_RECV))
+					goto next_listen;
+
+				if (tcpdiag_dump_reqs(skb, sk, cb) < 0) {
+					tcp_listen_unlock();
+					goto done;
+				}
+
+next_listen:
+				cb->args[3] = 0;
+				cb->args[4] = 0;
+				++num;
+			}
+
+			s_num = 0;
+			cb->args[3] = 0;
+			cb->args[4] = 0;
+		}
+		tcp_listen_unlock();
+skip_listen_ht:
+		cb->args[0] = 1;
+		s_i = num = s_num = 0;
+	}
+
+	if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV)))
+		return skb->len;
+
+	for (i = s_i; i < tcp_ehash_size; i++) {
+		struct tcp_ehash_bucket *head = &tcp_ehash[i];
+		struct sock *sk;
+		struct hlist_node *node;
+
+		if (i > s_i)
+			s_num = 0;
+
+		read_lock_bh(&head->lock);
+
+		num = 0;
+		sk_for_each(sk, node, &head->chain) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (num < s_num)
+				goto next_normal;
+			if (!(r->tcpdiag_states & (1 << sk->sk_state)))
+				goto next_normal;
+			if (r->id.tcpdiag_sport != inet->sport &&
+			    r->id.tcpdiag_sport)
+				goto next_normal;
+			if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport)
+				goto next_normal;
+			if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
+				read_unlock_bh(&head->lock);
+				goto done;
+			}
+next_normal:
+			++num;
+		}
+
+		if (r->tcpdiag_states&TCPF_TIME_WAIT) {
+			sk_for_each(sk, node,
+				    &tcp_ehash[i + tcp_ehash_size].chain) {
+				struct inet_sock *inet = inet_sk(sk);
+
+				if (num < s_num)
+					goto next_dying;
+				if (r->id.tcpdiag_sport != inet->sport &&
+				    r->id.tcpdiag_sport)
+					goto next_dying;
+				if (r->id.tcpdiag_dport != inet->dport &&
+				    r->id.tcpdiag_dport)
+					goto next_dying;
+				if (tcpdiag_dump_sock(skb, sk, cb) < 0) {
+					read_unlock_bh(&head->lock);
+					goto done;
+				}
+next_dying:
+				++num;
+			}
+		}
+		read_unlock_bh(&head->lock);
+	}
+
+done:
+	cb->args[1] = i;
+	cb->args[2] = num;
+	return skb->len;
+}
+
+static int tcpdiag_dump_done(struct netlink_callback *cb)
+{
+	return 0;
+}
+
+
+static __inline__ int
+tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	if (!(nlh->nlmsg_flags&NLM_F_REQUEST))
+		return 0;
+
+	if (nlh->nlmsg_type != TCPDIAG_GETSOCK)
+		goto err_inval;
+
+	if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len)
+		goto err_inval;
+
+	if (nlh->nlmsg_flags&NLM_F_DUMP) {
+		if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) {
+			struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq));
+			if (rta->rta_type != TCPDIAG_REQ_BYTECODE ||
+			    rta->rta_len < 8 ||
+			    rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq)))
+				goto err_inval;
+			if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta)))
+				goto err_inval;
+		}
+		return netlink_dump_start(tcpnl, skb, nlh,
+					  tcpdiag_dump,
+					  tcpdiag_dump_done);
+	} else {
+		return tcpdiag_get_exact(skb, nlh);
+	}
+
+err_inval:
+	return -EINVAL;
+}
+
+
+static inline void tcpdiag_rcv_skb(struct sk_buff *skb)
+{
+	int err;
+	struct nlmsghdr * nlh;
+
+	if (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *)skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return;
+		err = tcpdiag_rcv_msg(skb, nlh);
+		if (err || nlh->nlmsg_flags & NLM_F_ACK) 
+			netlink_ack(skb, nlh, err);
+	}
+}
+
+static void tcpdiag_rcv(struct sock *sk, int len)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+		tcpdiag_rcv_skb(skb);
+		kfree_skb(skb);
+	}
+}
+
+static int __init tcpdiag_init(void)
+{
+	tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv);
+	if (tcpnl == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void __exit tcpdiag_exit(void)
+{
+	sock_release(tcpnl->sk_socket);
+}
+
+module_init(tcpdiag_init);
+module_exit(tcpdiag_exit);
+MODULE_LICENSE("GPL");
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
new file mode 100644
index 000000000000..250492735902
--- /dev/null
+++ b/net/ipv4/tcp_input.c
@@ -0,0 +1,4959 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version:	$Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:
+ *		Pedro Roque	:	Fast Retransmit/Recovery.
+ *					Two receive queues.
+ *					Retransmit queue handled by TCP.
+ *					Better retransmit timer handling.
+ *					New congestion avoidance.
+ *					Header prediction.
+ *					Variable renaming.
+ *
+ *		Eric		:	Fast Retransmit.
+ *		Randy Scott	:	MSS option defines.
+ *		Eric Schenk	:	Fixes to slow start algorithm.
+ *		Eric Schenk	:	Yet another double ACK bug.
+ *		Eric Schenk	:	Delayed ACK bug fixes.
+ *		Eric Schenk	:	Floyd style fast retrans war avoidance.
+ *		David S. Miller	:	Don't allow zero congestion window.
+ *		Eric Schenk	:	Fix retransmitter so that it sends
+ *					next packet on ack of previous packet.
+ *		Andi Kleen	:	Moved open_request checking here
+ *					and process RSTs for open_requests.
+ *		Andi Kleen	:	Better prune_queue, and other fixes.
+ *		Andrey Savochkin:	Fix RTT measurements in the presnce of
+ *					timestamps.
+ *		Andrey Savochkin:	Check sequence numbers correctly when
+ *					removing SACKs due to in sequence incoming
+ *					data segments.
+ *		Andi Kleen:		Make sure we never ack data there is not
+ *					enough room for. Also make this condition
+ *					a fatal error if it might still happen.
+ *		Andi Kleen:		Add tcp_measure_rcv_mss to make 
+ *					connections with MSS<min(MTU,ann. MSS)
+ *					work without delayed acks. 
+ *		Andi Kleen:		Process packets with PSH set in the
+ *					fast path.
+ *		J Hadi Salim:		ECN support
+ *	 	Andrei Gurtov,
+ *		Pasi Sarolahti,
+ *		Panu Kuhlberg:		Experimental audit of TCP (re)transmission
+ *					engine. Lots of bugs are found.
+ *		Pasi Sarolahti:		F-RTO for dealing with spurious RTOs
+ *		Angelo Dell'Aera:	TCP Westwood+ support
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <linux/ipsec.h>
+#include <asm/unaligned.h>
+
+int sysctl_tcp_timestamps = 1;
+int sysctl_tcp_window_scaling = 1;
+int sysctl_tcp_sack = 1;
+int sysctl_tcp_fack = 1;
+int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
+int sysctl_tcp_ecn;
+int sysctl_tcp_dsack = 1;
+int sysctl_tcp_app_win = 31;
+int sysctl_tcp_adv_win_scale = 2;
+
+int sysctl_tcp_stdurg;
+int sysctl_tcp_rfc1337;
+int sysctl_tcp_max_orphans = NR_FILE;
+int sysctl_tcp_frto;
+int sysctl_tcp_nometrics_save;
+int sysctl_tcp_westwood;
+int sysctl_tcp_vegas_cong_avoid;
+
+int sysctl_tcp_moderate_rcvbuf = 1;
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_beta  = 3<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
+int sysctl_tcp_bic = 1;
+int sysctl_tcp_bic_fast_convergence = 1;
+int sysctl_tcp_bic_low_window = 14;
+int sysctl_tcp_bic_beta = 819;		/* = 819/1024 (BICTCP_BETA_SCALE) */
+
+#define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
+#define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
+#define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
+#define FLAG_RETRANS_DATA_ACKED	0x08 /* "" "" some of which was retransmitted.	*/
+#define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
+#define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
+#define FLAG_ECE		0x40 /* ECE in this ACK				*/
+#define FLAG_DATA_LOST		0x80 /* SACK detected data lossage.		*/
+#define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
+
+#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
+#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
+#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
+
+#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0)
+#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
+#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
+
+#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
+
+/* Adapt the MSS value used to make delayed ack decision to the 
+ * real world.
+ */ 
+static inline void tcp_measure_rcv_mss(struct tcp_sock *tp,
+				       struct sk_buff *skb)
+{
+	unsigned int len, lss;
+
+	lss = tp->ack.last_seg_size; 
+	tp->ack.last_seg_size = 0; 
+
+	/* skb->len may jitter because of SACKs, even if peer
+	 * sends good full-sized frames.
+	 */
+	len = skb->len;
+	if (len >= tp->ack.rcv_mss) {
+		tp->ack.rcv_mss = len;
+	} else {
+		/* Otherwise, we make more careful check taking into account,
+		 * that SACKs block is variable.
+		 *
+		 * "len" is invariant segment length, including TCP header.
+		 */
+		len += skb->data - skb->h.raw;
+		if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
+		    /* If PSH is not set, packet should be
+		     * full sized, provided peer TCP is not badly broken.
+		     * This observation (if it is correct 8)) allows
+		     * to handle super-low mtu links fairly.
+		     */
+		    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
+		     !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) {
+			/* Subtract also invariant (if peer is RFC compliant),
+			 * tcp header plus fixed timestamp option length.
+			 * Resulting "len" is MSS free of SACK jitter.
+			 */
+			len -= tp->tcp_header_len;
+			tp->ack.last_seg_size = len;
+			if (len == lss) {
+				tp->ack.rcv_mss = len;
+				return;
+			}
+		}
+		tp->ack.pending |= TCP_ACK_PUSHED;
+	}
+}
+
+static void tcp_incr_quickack(struct tcp_sock *tp)
+{
+	unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss);
+
+	if (quickacks==0)
+		quickacks=2;
+	if (quickacks > tp->ack.quick)
+		tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+}
+
+void tcp_enter_quickack_mode(struct tcp_sock *tp)
+{
+	tcp_incr_quickack(tp);
+	tp->ack.pingpong = 0;
+	tp->ack.ato = TCP_ATO_MIN;
+}
+
+/* Send ACKs quickly, if "quick" count is not exhausted
+ * and the session is not interactive.
+ */
+
+static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp)
+{
+	return (tp->ack.quick && !tp->ack.pingpong);
+}
+
+/* Buffer size and advertised window tuning.
+ *
+ * 1. Tuning sk->sk_sndbuf, when connection enters established state.
+ */
+
+static void tcp_fixup_sndbuf(struct sock *sk)
+{
+	int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
+		     sizeof(struct sk_buff);
+
+	if (sk->sk_sndbuf < 3 * sndmem)
+		sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
+}
+
+/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
+ *
+ * All tcp_full_space() is split to two parts: "network" buffer, allocated
+ * forward and advertised in receiver window (tp->rcv_wnd) and
+ * "application buffer", required to isolate scheduling/application
+ * latencies from network.
+ * window_clamp is maximal advertised window. It can be less than
+ * tcp_full_space(), in this case tcp_full_space() - window_clamp
+ * is reserved for "application" buffer. The less window_clamp is
+ * the smoother our behaviour from viewpoint of network, but the lower
+ * throughput and the higher sensitivity of the connection to losses. 8)
+ *
+ * rcv_ssthresh is more strict window_clamp used at "slow start"
+ * phase to predict further behaviour of this connection.
+ * It is used for two goals:
+ * - to enforce header prediction at sender, even when application
+ *   requires some significant "application buffer". It is check #1.
+ * - to prevent pruning of receive queue because of misprediction
+ *   of receiver window. Check #2.
+ *
+ * The scheme does not work when sender sends good segments opening
+ * window and then starts to feed us spagetti. But it should work
+ * in common situations. Otherwise, we have to rely on queue collapsing.
+ */
+
+/* Slow part of check#2. */
+static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
+			     struct sk_buff *skb)
+{
+	/* Optimize this! */
+	int truesize = tcp_win_from_space(skb->truesize)/2;
+	int window = tcp_full_space(sk)/2;
+
+	while (tp->rcv_ssthresh <= window) {
+		if (truesize <= skb->len)
+			return 2*tp->ack.rcv_mss;
+
+		truesize >>= 1;
+		window >>= 1;
+	}
+	return 0;
+}
+
+static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp,
+				   struct sk_buff *skb)
+{
+	/* Check #1 */
+	if (tp->rcv_ssthresh < tp->window_clamp &&
+	    (int)tp->rcv_ssthresh < tcp_space(sk) &&
+	    !tcp_memory_pressure) {
+		int incr;
+
+		/* Check #2. Increase window, if skb with such overhead
+		 * will fit to rcvbuf in future.
+		 */
+		if (tcp_win_from_space(skb->truesize) <= skb->len)
+			incr = 2*tp->advmss;
+		else
+			incr = __tcp_grow_window(sk, tp, skb);
+
+		if (incr) {
+			tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp);
+			tp->ack.quick |= 1;
+		}
+	}
+}
+
+/* 3. Tuning rcvbuf, when connection enters established state. */
+
+static void tcp_fixup_rcvbuf(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);
+
+	/* Try to select rcvbuf so that 4 mss-sized segments
+	 * will fit to window and correspoding skbs will fit to our rcvbuf.
+	 * (was 3; 4 is minimum to allow fast retransmit to work.)
+	 */
+	while (tcp_win_from_space(rcvmem) < tp->advmss)
+		rcvmem += 128;
+	if (sk->sk_rcvbuf < 4 * rcvmem)
+		sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
+}
+
+/* 4. Try to fixup all. It is made iimediately after connection enters
+ *    established state.
+ */
+static void tcp_init_buffer_space(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int maxwin;
+
+	if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		tcp_fixup_rcvbuf(sk);
+	if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
+		tcp_fixup_sndbuf(sk);
+
+	tp->rcvq_space.space = tp->rcv_wnd;
+
+	maxwin = tcp_full_space(sk);
+
+	if (tp->window_clamp >= maxwin) {
+		tp->window_clamp = maxwin;
+
+		if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
+			tp->window_clamp = max(maxwin -
+					       (maxwin >> sysctl_tcp_app_win),
+					       4 * tp->advmss);
+	}
+
+	/* Force reservation of one segment. */
+	if (sysctl_tcp_app_win &&
+	    tp->window_clamp > 2 * tp->advmss &&
+	    tp->window_clamp + tp->advmss > maxwin)
+		tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
+
+	tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void init_bictcp(struct tcp_sock *tp)
+{
+	tp->bictcp.cnt = 0;
+
+	tp->bictcp.last_max_cwnd = 0;
+	tp->bictcp.last_cwnd = 0;
+	tp->bictcp.last_stamp = 0;
+}
+
+/* 5. Recalculate window clamp after socket hit its memory bounds. */
+static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
+{
+	struct sk_buff *skb;
+	unsigned int app_win = tp->rcv_nxt - tp->copied_seq;
+	int ofo_win = 0;
+
+	tp->ack.quick = 0;
+
+	skb_queue_walk(&tp->out_of_order_queue, skb) {
+		ofo_win += skb->len;
+	}
+
+	/* If overcommit is due to out of order segments,
+	 * do not clamp window. Try to expand rcvbuf instead.
+	 */
+	if (ofo_win) {
+		if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
+		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
+		    !tcp_memory_pressure &&
+		    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
+			sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
+					    sysctl_tcp_rmem[2]);
+	}
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) {
+		app_win += ofo_win;
+		if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf)
+			app_win >>= 1;
+		if (app_win > tp->ack.rcv_mss)
+			app_win -= tp->ack.rcv_mss;
+		app_win = max(app_win, 2U*tp->advmss);
+
+		if (!ofo_win)
+			tp->window_clamp = min(tp->window_clamp, app_win);
+		tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
+	}
+}
+
+/* Receiver "autotuning" code.
+ *
+ * The algorithm for RTT estimation w/o timestamps is based on
+ * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL.
+ * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps>
+ *
+ * More detail on this code can be found at
+ * <http://www.psc.edu/~jheffner/senior_thesis.ps>,
+ * though this reference is out of date.  A new paper
+ * is pending.
+ */
+static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep)
+{
+	u32 new_sample = tp->rcv_rtt_est.rtt;
+	long m = sample;
+
+	if (m == 0)
+		m = 1;
+
+	if (new_sample != 0) {
+		/* If we sample in larger samples in the non-timestamp
+		 * case, we could grossly overestimate the RTT especially
+		 * with chatty applications or bulk transfer apps which
+		 * are stalled on filesystem I/O.
+		 *
+		 * Also, since we are only going for a minimum in the
+		 * non-timestamp case, we do not smoothe things out
+		 * else with timestamps disabled convergance takes too
+		 * long.
+		 */
+		if (!win_dep) {
+			m -= (new_sample >> 3);
+			new_sample += m;
+		} else if (m < new_sample)
+			new_sample = m << 3;
+	} else {
+		/* No previous mesaure. */
+		new_sample = m << 3;
+	}
+
+	if (tp->rcv_rtt_est.rtt != new_sample)
+		tp->rcv_rtt_est.rtt = new_sample;
+}
+
+static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp)
+{
+	if (tp->rcv_rtt_est.time == 0)
+		goto new_measure;
+	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
+		return;
+	tcp_rcv_rtt_update(tp,
+			   jiffies - tp->rcv_rtt_est.time,
+			   1);
+
+new_measure:
+	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
+	tp->rcv_rtt_est.time = tcp_time_stamp;
+}
+
+static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	if (tp->rx_opt.rcv_tsecr &&
+	    (TCP_SKB_CB(skb)->end_seq -
+	     TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
+		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);
+}
+
+/*
+ * This function should be called every time data is copied to user space.
+ * It calculates the appropriate TCP receive buffer space.
+ */
+void tcp_rcv_space_adjust(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int time;
+	int space;
+	
+	if (tp->rcvq_space.time == 0)
+		goto new_measure;
+	
+	time = tcp_time_stamp - tp->rcvq_space.time;
+	if (time < (tp->rcv_rtt_est.rtt >> 3) ||
+	    tp->rcv_rtt_est.rtt == 0)
+		return;
+	
+	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);
+
+	space = max(tp->rcvq_space.space, space);
+
+	if (tp->rcvq_space.space != space) {
+		int rcvmem;
+
+		tp->rcvq_space.space = space;
+
+		if (sysctl_tcp_moderate_rcvbuf) {
+			int new_clamp = space;
+
+			/* Receive space grows, normalize in order to
+			 * take into account packet headers and sk_buff
+			 * structure overhead.
+			 */
+			space /= tp->advmss;
+			if (!space)
+				space = 1;
+			rcvmem = (tp->advmss + MAX_TCP_HEADER +
+				  16 + sizeof(struct sk_buff));
+			while (tcp_win_from_space(rcvmem) < tp->advmss)
+				rcvmem += 128;
+			space *= rcvmem;
+			space = min(space, sysctl_tcp_rmem[2]);
+			if (space > sk->sk_rcvbuf) {
+				sk->sk_rcvbuf = space;
+
+				/* Make the window clamp follow along.  */
+				tp->window_clamp = new_clamp;
+			}
+		}
+	}
+	
+new_measure:
+	tp->rcvq_space.seq = tp->copied_seq;
+	tp->rcvq_space.time = tcp_time_stamp;
+}
+
+/* There is something which you must keep in mind when you analyze the
+ * behavior of the tp->ato delayed ack timeout interval.  When a
+ * connection starts up, we want to ack as quickly as possible.  The
+ * problem is that "good" TCP's do slow start at the beginning of data
+ * transmission.  The means that until we send the first few ACK's the
+ * sender will sit on his end and only queue most of his data, because
+ * he can only send snd_cwnd unacked packets at any given time.  For
+ * each ACK we send, he increments snd_cwnd and transmits more of his
+ * queue.  -DaveM
+ */
+static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
+{
+	u32 now;
+
+	tcp_schedule_ack(tp);
+
+	tcp_measure_rcv_mss(tp, skb);
+
+	tcp_rcv_rtt_measure(tp);
+	
+	now = tcp_time_stamp;
+
+	if (!tp->ack.ato) {
+		/* The _first_ data packet received, initialize
+		 * delayed ACK engine.
+		 */
+		tcp_incr_quickack(tp);
+		tp->ack.ato = TCP_ATO_MIN;
+	} else {
+		int m = now - tp->ack.lrcvtime;
+
+		if (m <= TCP_ATO_MIN/2) {
+			/* The fastest case is the first. */
+			tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2;
+		} else if (m < tp->ack.ato) {
+			tp->ack.ato = (tp->ack.ato>>1) + m;
+			if (tp->ack.ato > tp->rto)
+				tp->ack.ato = tp->rto;
+		} else if (m > tp->rto) {
+			/* Too long gap. Apparently sender falled to
+			 * restart window, so that we send ACKs quickly.
+			 */
+			tcp_incr_quickack(tp);
+			sk_stream_mem_reclaim(sk);
+		}
+	}
+	tp->ack.lrcvtime = now;
+
+	TCP_ECN_check_ce(tp, skb);
+
+	if (skb->len >= 128)
+		tcp_grow_window(sk, tp, skb);
+}
+
+/* When starting a new connection, pin down the current choice of 
+ * congestion algorithm.
+ */
+void tcp_ca_init(struct tcp_sock *tp)
+{
+	if (sysctl_tcp_westwood) 
+		tp->adv_cong = TCP_WESTWOOD;
+	else if (sysctl_tcp_bic)
+		tp->adv_cong = TCP_BIC;
+	else if (sysctl_tcp_vegas_cong_avoid) {
+		tp->adv_cong = TCP_VEGAS;
+		tp->vegas.baseRTT = 0x7fffffff;
+		tcp_vegas_enable(tp);
+	} 
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt)
+{
+	__u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < tp->vegas.baseRTT) 
+		tp->vegas.baseRTT = vrtt;
+
+	/* Find the min RTT during the last RTT to find
+	 * the current prop. delay + queuing delay:
+	 */
+	tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
+	tp->vegas.cntRTT++;
+}
+
+/* Called to compute a smoothed rtt estimate. The data fed to this
+ * routine either comes from timestamps, or from segments that were
+ * known _not_ to have been retransmitted [see Karn/Partridge
+ * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
+ * piece by Van Jacobson.
+ * NOTE: the next three routines used to be one big routine.
+ * To save cycles in the RFC 1323 implementation it was better to break
+ * it up into three procedures. -- erics
+ */
+static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt)
+{
+	long m = mrtt; /* RTT */
+
+	if (tcp_vegas_enabled(tp))
+		vegas_rtt_calc(tp, mrtt);
+
+	/*	The following amusing code comes from Jacobson's
+	 *	article in SIGCOMM '88.  Note that rtt and mdev
+	 *	are scaled versions of rtt and mean deviation.
+	 *	This is designed to be as fast as possible 
+	 *	m stands for "measurement".
+	 *
+	 *	On a 1990 paper the rto value is changed to:
+	 *	RTO = rtt + 4 * mdev
+	 *
+	 * Funny. This algorithm seems to be very broken.
+	 * These formulae increase RTO, when it should be decreased, increase
+	 * too slowly, when it should be incresed fastly, decrease too fastly
+	 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely
+	 * does not matter how to _calculate_ it. Seems, it was trap
+	 * that VJ failed to avoid. 8)
+	 */
+	if(m == 0)
+		m = 1;
+	if (tp->srtt != 0) {
+		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
+		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
+		if (m < 0) {
+			m = -m;		/* m is now abs(error) */
+			m -= (tp->mdev >> 2);   /* similar update on mdev */
+			/* This is similar to one of Eifel findings.
+			 * Eifel blocks mdev updates when rtt decreases.
+			 * This solution is a bit different: we use finer gain
+			 * for mdev in this case (alpha*beta).
+			 * Like Eifel it also prevents growth of rto,
+			 * but also it limits too fast rto decreases,
+			 * happening in pure Eifel.
+			 */
+			if (m > 0)
+				m >>= 3;
+		} else {
+			m -= (tp->mdev >> 2);   /* similar update on mdev */
+		}
+		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */
+		if (tp->mdev > tp->mdev_max) {
+			tp->mdev_max = tp->mdev;
+			if (tp->mdev_max > tp->rttvar)
+				tp->rttvar = tp->mdev_max;
+		}
+		if (after(tp->snd_una, tp->rtt_seq)) {
+			if (tp->mdev_max < tp->rttvar)
+				tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;
+			tp->rtt_seq = tp->snd_nxt;
+			tp->mdev_max = TCP_RTO_MIN;
+		}
+	} else {
+		/* no previous measure. */
+		tp->srtt = m<<3;	/* take the measured time to be rtt */
+		tp->mdev = m<<1;	/* make sure rto = 3*rtt */
+		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+		tp->rtt_seq = tp->snd_nxt;
+	}
+
+	tcp_westwood_update_rtt(tp, tp->srtt >> 3);
+}
+
+/* Calculate rto without backoff.  This is the second half of Van Jacobson's
+ * routine referred to above.
+ */
+static inline void tcp_set_rto(struct tcp_sock *tp)
+{
+	/* Old crap is replaced with new one. 8)
+	 *
+	 * More seriously:
+	 * 1. If rtt variance happened to be less 50msec, it is hallucination.
+	 *    It cannot be less due to utterly erratic ACK generation made
+	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_
+	 *    to do with delayed acks, because at cwnd>2 true delack timeout
+	 *    is invisible. Actually, Linux-2.4 also generates erratic
+	 *    ACKs in some curcumstances.
+	 */
+	tp->rto = (tp->srtt >> 3) + tp->rttvar;
+
+	/* 2. Fixups made earlier cannot be right.
+	 *    If we do not estimate RTO correctly without them,
+	 *    all the algo is pure shit and should be replaced
+	 *    with correct one. It is exaclty, which we pretend to do.
+	 */
+}
+
+/* NOTE: clamping at TCP_RTO_MIN is not required, current algo
+ * guarantees that rto is higher.
+ */
+static inline void tcp_bound_rto(struct tcp_sock *tp)
+{
+	if (tp->rto > TCP_RTO_MAX)
+		tp->rto = TCP_RTO_MAX;
+}
+
+/* Save metrics learned by this TCP session.
+   This function is called only, when TCP finishes successfully
+   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE.
+ */
+void tcp_update_metrics(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (sysctl_tcp_nometrics_save)
+		return;
+
+	dst_confirm(dst);
+
+	if (dst && (dst->flags&DST_HOST)) {
+		int m;
+
+		if (tp->backoff || !tp->srtt) {
+			/* This session failed to estimate rtt. Why?
+			 * Probably, no packets returned in time.
+			 * Reset our results.
+			 */
+			if (!(dst_metric_locked(dst, RTAX_RTT)))
+				dst->metrics[RTAX_RTT-1] = 0;
+			return;
+		}
+
+		m = dst_metric(dst, RTAX_RTT) - tp->srtt;
+
+		/* If newly calculated rtt larger than stored one,
+		 * store new one. Otherwise, use EWMA. Remember,
+		 * rtt overestimation is always better than underestimation.
+		 */
+		if (!(dst_metric_locked(dst, RTAX_RTT))) {
+			if (m <= 0)
+				dst->metrics[RTAX_RTT-1] = tp->srtt;
+			else
+				dst->metrics[RTAX_RTT-1] -= (m>>3);
+		}
+
+		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {
+			if (m < 0)
+				m = -m;
+
+			/* Scale deviation to rttvar fixed point */
+			m >>= 1;
+			if (m < tp->mdev)
+				m = tp->mdev;
+
+			if (m >= dst_metric(dst, RTAX_RTTVAR))
+				dst->metrics[RTAX_RTTVAR-1] = m;
+			else
+				dst->metrics[RTAX_RTTVAR-1] -=
+					(dst->metrics[RTAX_RTTVAR-1] - m)>>2;
+		}
+
+		if (tp->snd_ssthresh >= 0xFFFF) {
+			/* Slow start still did not finish. */
+			if (dst_metric(dst, RTAX_SSTHRESH) &&
+			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))
+				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;
+			if (!dst_metric_locked(dst, RTAX_CWND) &&
+			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))
+				dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;
+		} else if (tp->snd_cwnd > tp->snd_ssthresh &&
+			   tp->ca_state == TCP_CA_Open) {
+			/* Cong. avoidance phase, cwnd is reliable. */
+			if (!dst_metric_locked(dst, RTAX_SSTHRESH))
+				dst->metrics[RTAX_SSTHRESH-1] =
+					max(tp->snd_cwnd >> 1, tp->snd_ssthresh);
+			if (!dst_metric_locked(dst, RTAX_CWND))
+				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;
+		} else {
+			/* Else slow start did not finish, cwnd is non-sense,
+			   ssthresh may be also invalid.
+			 */
+			if (!dst_metric_locked(dst, RTAX_CWND))
+				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;
+			if (dst->metrics[RTAX_SSTHRESH-1] &&
+			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&
+			    tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])
+				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;
+		}
+
+		if (!dst_metric_locked(dst, RTAX_REORDERING)) {
+			if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&
+			    tp->reordering != sysctl_tcp_reordering)
+				dst->metrics[RTAX_REORDERING-1] = tp->reordering;
+		}
+	}
+}
+
+/* Numbers are taken from RFC2414.  */
+__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
+{
+	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
+
+	if (!cwnd) {
+		if (tp->mss_cache_std > 1460)
+			cwnd = 2;
+		else
+			cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
+	}
+	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
+}
+
+/* Initialize metrics on socket. */
+
+static void tcp_init_metrics(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+
+	if (dst == NULL)
+		goto reset;
+
+	dst_confirm(dst);
+
+	if (dst_metric_locked(dst, RTAX_CWND))
+		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
+	if (dst_metric(dst, RTAX_SSTHRESH)) {
+		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
+		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
+			tp->snd_ssthresh = tp->snd_cwnd_clamp;
+	}
+	if (dst_metric(dst, RTAX_REORDERING) &&
+	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
+		tp->rx_opt.sack_ok &= ~2;
+		tp->reordering = dst_metric(dst, RTAX_REORDERING);
+	}
+
+	if (dst_metric(dst, RTAX_RTT) == 0)
+		goto reset;
+
+	if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
+		goto reset;
+
+	/* Initial rtt is determined from SYN,SYN-ACK.
+	 * The segment is small and rtt may appear much
+	 * less than real one. Use per-dst memory
+	 * to make it more realistic.
+	 *
+	 * A bit of theory. RTT is time passed after "normal" sized packet
+	 * is sent until it is ACKed. In normal curcumstances sending small
+	 * packets force peer to delay ACKs and calculation is correct too.
+	 * The algorithm is adaptive and, provided we follow specs, it
+	 * NEVER underestimate RTT. BUT! If peer tries to make some clever
+	 * tricks sort of "quick acks" for time long enough to decrease RTT
+	 * to low value, and then abruptly stops to do it and starts to delay
+	 * ACKs, wait for troubles.
+	 */
+	if (dst_metric(dst, RTAX_RTT) > tp->srtt) {
+		tp->srtt = dst_metric(dst, RTAX_RTT);
+		tp->rtt_seq = tp->snd_nxt;
+	}
+	if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {
+		tp->mdev = dst_metric(dst, RTAX_RTTVAR);
+		tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
+	}
+	tcp_set_rto(tp);
+	tcp_bound_rto(tp);
+	if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)
+		goto reset;
+	tp->snd_cwnd = tcp_init_cwnd(tp, dst);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	return;
+
+reset:
+	/* Play conservative. If timestamps are not
+	 * supported, TCP will fail to recalculate correct
+	 * rtt, if initial rto is too small. FORGET ALL AND RESET!
+	 */
+	if (!tp->rx_opt.saw_tstamp && tp->srtt) {
+		tp->srtt = 0;
+		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;
+		tp->rto = TCP_TIMEOUT_INIT;
+	}
+}
+
+static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts)
+{
+	if (metric > tp->reordering) {
+		tp->reordering = min(TCP_MAX_REORDERING, metric);
+
+		/* This exciting event is worth to be remembered. 8) */
+		if (ts)
+			NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);
+		else if (IsReno(tp))
+			NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);
+		else if (IsFack(tp))
+			NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);
+		else
+			NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);
+#if FASTRETRANS_DEBUG > 1
+		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",
+		       tp->rx_opt.sack_ok, tp->ca_state,
+		       tp->reordering,
+		       tp->fackets_out,
+		       tp->sacked_out,
+		       tp->undo_marker ? tp->undo_retrans : 0);
+#endif
+		/* Disable FACK yet. */
+		tp->rx_opt.sack_ok &= ~2;
+	}
+}
+
+/* This procedure tags the retransmission queue when SACKs arrive.
+ *
+ * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L).
+ * Packets in queue with these bits set are counted in variables
+ * sacked_out, retrans_out and lost_out, correspondingly.
+ *
+ * Valid combinations are:
+ * Tag  InFlight	Description
+ * 0	1		- orig segment is in flight.
+ * S	0		- nothing flies, orig reached receiver.
+ * L	0		- nothing flies, orig lost by net.
+ * R	2		- both orig and retransmit are in flight.
+ * L|R	1		- orig is lost, retransmit is in flight.
+ * S|R  1		- orig reached receiver, retrans is still in flight.
+ * (L|S|R is logically valid, it could occur when L|R is sacked,
+ *  but it is equivalent to plain S and code short-curcuits it to S.
+ *  L|S is logically invalid, it would mean -1 packet in flight 8))
+ *
+ * These 6 states form finite state machine, controlled by the following events:
+ * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue())
+ * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue())
+ * 3. Loss detection event of one of three flavors:
+ *	A. Scoreboard estimator decided the packet is lost.
+ *	   A'. Reno "three dupacks" marks head of queue lost.
+ *	   A''. Its FACK modfication, head until snd.fack is lost.
+ *	B. SACK arrives sacking data transmitted after never retransmitted
+ *	   hole was sent out.
+ *	C. SACK arrives sacking SND.NXT at the moment, when the
+ *	   segment was retransmitted.
+ * 4. D-SACK added new rule: D-SACK changes any tag to S.
+ *
+ * It is pleasant to note, that state diagram turns out to be commutative,
+ * so that we are allowed not to be bothered by order of our actions,
+ * when multiple events arrive simultaneously. (see the function below).
+ *
+ * Reordering detection.
+ * --------------------
+ * Reordering metric is maximal distance, which a packet can be displaced
+ * in packet stream. With SACKs we can estimate it:
+ *
+ * 1. SACK fills old hole and the corresponding segment was not
+ *    ever retransmitted -> reordering. Alas, we cannot use it
+ *    when segment was retransmitted.
+ * 2. The last flaw is solved with D-SACK. D-SACK arrives
+ *    for retransmitted and already SACKed segment -> reordering..
+ * Both of these heuristics are not used in Loss state, when we cannot
+ * account for retransmits accurately.
+ */
+static int
+tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
+	struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
+	int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
+	int reord = tp->packets_out;
+	int prior_fackets;
+	u32 lost_retrans = 0;
+	int flag = 0;
+	int i;
+
+	/* So, SACKs for already sent large segments will be lost.
+	 * Not good, but alternative is to resegment the queue. */
+	if (sk->sk_route_caps & NETIF_F_TSO) {
+		sk->sk_route_caps &= ~NETIF_F_TSO;
+		sock_set_flag(sk, SOCK_NO_LARGESEND);
+		tp->mss_cache = tp->mss_cache_std;
+	}
+
+	if (!tp->sacked_out)
+		tp->fackets_out = 0;
+	prior_fackets = tp->fackets_out;
+
+	for (i=0; i<num_sacks; i++, sp++) {
+		struct sk_buff *skb;
+		__u32 start_seq = ntohl(sp->start_seq);
+		__u32 end_seq = ntohl(sp->end_seq);
+		int fack_count = 0;
+		int dup_sack = 0;
+
+		/* Check for D-SACK. */
+		if (i == 0) {
+			u32 ack = TCP_SKB_CB(ack_skb)->ack_seq;
+
+			if (before(start_seq, ack)) {
+				dup_sack = 1;
+				tp->rx_opt.sack_ok |= 4;
+				NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
+			} else if (num_sacks > 1 &&
+				   !after(end_seq, ntohl(sp[1].end_seq)) &&
+				   !before(start_seq, ntohl(sp[1].start_seq))) {
+				dup_sack = 1;
+				tp->rx_opt.sack_ok |= 4;
+				NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
+			}
+
+			/* D-SACK for already forgotten data...
+			 * Do dumb counting. */
+			if (dup_sack &&
+			    !after(end_seq, prior_snd_una) &&
+			    after(end_seq, tp->undo_marker))
+				tp->undo_retrans--;
+
+			/* Eliminate too old ACKs, but take into
+			 * account more or less fresh ones, they can
+			 * contain valid SACK info.
+			 */
+			if (before(ack, prior_snd_una - tp->max_window))
+				return 0;
+		}
+
+		/* Event "B" in the comment above. */
+		if (after(end_seq, tp->high_seq))
+			flag |= FLAG_DATA_LOST;
+
+		sk_stream_for_retrans_queue(skb, sk) {
+			u8 sacked = TCP_SKB_CB(skb)->sacked;
+			int in_sack;
+
+			/* The retransmission queue is always in order, so
+			 * we can short-circuit the walk early.
+			 */
+			if(!before(TCP_SKB_CB(skb)->seq, end_seq))
+				break;
+
+			fack_count += tcp_skb_pcount(skb);
+
+			in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
+				!before(end_seq, TCP_SKB_CB(skb)->end_seq);
+
+			/* Account D-SACK for retransmitted packet. */
+			if ((dup_sack && in_sack) &&
+			    (sacked & TCPCB_RETRANS) &&
+			    after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker))
+				tp->undo_retrans--;
+
+			/* The frame is ACKed. */
+			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) {
+				if (sacked&TCPCB_RETRANS) {
+					if ((dup_sack && in_sack) &&
+					    (sacked&TCPCB_SACKED_ACKED))
+						reord = min(fack_count, reord);
+				} else {
+					/* If it was in a hole, we detected reordering. */
+					if (fack_count < prior_fackets &&
+					    !(sacked&TCPCB_SACKED_ACKED))
+						reord = min(fack_count, reord);
+				}
+
+				/* Nothing to do; acked frame is about to be dropped. */
+				continue;
+			}
+
+			if ((sacked&TCPCB_SACKED_RETRANS) &&
+			    after(end_seq, TCP_SKB_CB(skb)->ack_seq) &&
+			    (!lost_retrans || after(end_seq, lost_retrans)))
+				lost_retrans = end_seq;
+
+			if (!in_sack)
+				continue;
+
+			if (!(sacked&TCPCB_SACKED_ACKED)) {
+				if (sacked & TCPCB_SACKED_RETRANS) {
+					/* If the segment is not tagged as lost,
+					 * we do not clear RETRANS, believing
+					 * that retransmission is still in flight.
+					 */
+					if (sacked & TCPCB_LOST) {
+						TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+						tp->lost_out -= tcp_skb_pcount(skb);
+						tp->retrans_out -= tcp_skb_pcount(skb);
+					}
+				} else {
+					/* New sack for not retransmitted frame,
+					 * which was in hole. It is reordering.
+					 */
+					if (!(sacked & TCPCB_RETRANS) &&
+					    fack_count < prior_fackets)
+						reord = min(fack_count, reord);
+
+					if (sacked & TCPCB_LOST) {
+						TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+						tp->lost_out -= tcp_skb_pcount(skb);
+					}
+				}
+
+				TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
+				flag |= FLAG_DATA_SACKED;
+				tp->sacked_out += tcp_skb_pcount(skb);
+
+				if (fack_count > tp->fackets_out)
+					tp->fackets_out = fack_count;
+			} else {
+				if (dup_sack && (sacked&TCPCB_RETRANS))
+					reord = min(fack_count, reord);
+			}
+
+			/* D-SACK. We can detect redundant retransmission
+			 * in S|R and plain R frames and clear it.
+			 * undo_retrans is decreased above, L|R frames
+			 * are accounted above as well.
+			 */
+			if (dup_sack &&
+			    (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) {
+				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+			}
+		}
+	}
+
+	/* Check for lost retransmit. This superb idea is
+	 * borrowed from "ratehalving". Event "C".
+	 * Later note: FACK people cheated me again 8),
+	 * we have to account for reordering! Ugly,
+	 * but should help.
+	 */
+	if (lost_retrans && tp->ca_state == TCP_CA_Recovery) {
+		struct sk_buff *skb;
+
+		sk_stream_for_retrans_queue(skb, sk) {
+			if (after(TCP_SKB_CB(skb)->seq, lost_retrans))
+				break;
+			if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+				continue;
+			if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) &&
+			    after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) &&
+			    (IsFack(tp) ||
+			     !before(lost_retrans,
+				     TCP_SKB_CB(skb)->ack_seq + tp->reordering *
+				     tp->mss_cache_std))) {
+				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+
+				if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) {
+					tp->lost_out += tcp_skb_pcount(skb);
+					TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+					flag |= FLAG_DATA_SACKED;
+					NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
+				}
+			}
+		}
+	}
+
+	tp->left_out = tp->sacked_out + tp->lost_out;
+
+	if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss)
+		tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0);
+
+#if FASTRETRANS_DEBUG > 0
+	BUG_TRAP((int)tp->sacked_out >= 0);
+	BUG_TRAP((int)tp->lost_out >= 0);
+	BUG_TRAP((int)tp->retrans_out >= 0);
+	BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0);
+#endif
+	return flag;
+}
+
+/* RTO occurred, but do not yet enter loss state. Instead, transmit two new
+ * segments to see from the next ACKs whether any data was really missing.
+ * If the RTO was spurious, new ACKs should arrive.
+ */
+void tcp_enter_frto(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	tp->frto_counter = 1;
+
+	if (tp->ca_state <= TCP_CA_Disorder ||
+            tp->snd_una == tp->high_seq ||
+            (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
+		tp->prior_ssthresh = tcp_current_ssthresh(tp);
+		if (!tcp_westwood_ssthresh(tp))
+			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+	}
+
+	/* Have to clear retransmission markers here to keep the bookkeeping
+	 * in shape, even though we are not yet in Loss state.
+	 * If something was really lost, it is eventually caught up
+	 * in tcp_enter_frto_loss.
+	 */
+	tp->retrans_out = 0;
+	tp->undo_marker = tp->snd_una;
+	tp->undo_retrans = 0;
+
+	sk_stream_for_retrans_queue(skb, sk) {
+		TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS;
+	}
+	tcp_sync_left_out(tp);
+
+	tcp_set_ca_state(tp, TCP_CA_Open);
+	tp->frto_highmark = tp->snd_nxt;
+}
+
+/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO,
+ * which indicates that we should follow the traditional RTO recovery,
+ * i.e. mark everything lost and do go-back-N retransmission.
+ */
+static void tcp_enter_frto_loss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int cnt = 0;
+
+	tp->sacked_out = 0;
+	tp->lost_out = 0;
+	tp->fackets_out = 0;
+
+	sk_stream_for_retrans_queue(skb, sk) {
+		cnt += tcp_skb_pcount(skb);
+		TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
+
+			/* Do not mark those segments lost that were
+			 * forward transmitted after RTO
+			 */
+			if (!after(TCP_SKB_CB(skb)->end_seq,
+				   tp->frto_highmark)) {
+				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+				tp->lost_out += tcp_skb_pcount(skb);
+			}
+		} else {
+			tp->sacked_out += tcp_skb_pcount(skb);
+			tp->fackets_out = cnt;
+		}
+	}
+	tcp_sync_left_out(tp);
+
+	tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1;
+	tp->snd_cwnd_cnt = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->undo_marker = 0;
+	tp->frto_counter = 0;
+
+	tp->reordering = min_t(unsigned int, tp->reordering,
+					     sysctl_tcp_reordering);
+	tcp_set_ca_state(tp, TCP_CA_Loss);
+	tp->high_seq = tp->frto_highmark;
+	TCP_ECN_queue_cwr(tp);
+
+	init_bictcp(tp);
+}
+
+void tcp_clear_retrans(struct tcp_sock *tp)
+{
+	tp->left_out = 0;
+	tp->retrans_out = 0;
+
+	tp->fackets_out = 0;
+	tp->sacked_out = 0;
+	tp->lost_out = 0;
+
+	tp->undo_marker = 0;
+	tp->undo_retrans = 0;
+}
+
+/* Enter Loss state. If "how" is not zero, forget all SACK information
+ * and reset tags completely, otherwise preserve SACKs. If receiver
+ * dropped its ofo queue, we will know this due to reneging detection.
+ */
+void tcp_enter_loss(struct sock *sk, int how)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int cnt = 0;
+
+	/* Reduce ssthresh if it has not yet been made inside this window. */
+	if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
+	    (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
+		tp->prior_ssthresh = tcp_current_ssthresh(tp);
+		tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+	}
+	tp->snd_cwnd	   = 1;
+	tp->snd_cwnd_cnt   = 0;
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+
+	tcp_clear_retrans(tp);
+
+	/* Push undo marker, if it was plain RTO and nothing
+	 * was retransmitted. */
+	if (!how)
+		tp->undo_marker = tp->snd_una;
+
+	sk_stream_for_retrans_queue(skb, sk) {
+		cnt += tcp_skb_pcount(skb);
+		if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
+			tp->undo_marker = 0;
+		TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
+			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+			tp->lost_out += tcp_skb_pcount(skb);
+		} else {
+			tp->sacked_out += tcp_skb_pcount(skb);
+			tp->fackets_out = cnt;
+		}
+	}
+	tcp_sync_left_out(tp);
+
+	tp->reordering = min_t(unsigned int, tp->reordering,
+					     sysctl_tcp_reordering);
+	tcp_set_ca_state(tp, TCP_CA_Loss);
+	tp->high_seq = tp->snd_nxt;
+	TCP_ECN_queue_cwr(tp);
+}
+
+static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp)
+{
+	struct sk_buff *skb;
+
+	/* If ACK arrived pointing to a remembered SACK,
+	 * it means that our remembered SACKs do not reflect
+	 * real state of receiver i.e.
+	 * receiver _host_ is heavily congested (or buggy).
+	 * Do processing similar to RTO timeout.
+	 */
+	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
+	    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+		NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);
+
+		tcp_enter_loss(sk, 1);
+		tp->retransmits++;
+		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
+		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+		return 1;
+	}
+	return 0;
+}
+
+static inline int tcp_fackets_out(struct tcp_sock *tp)
+{
+	return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out;
+}
+
+static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
+}
+
+static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp)
+{
+	return tp->packets_out &&
+	       tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
+}
+
+/* Linux NewReno/SACK/FACK/ECN state machine.
+ * --------------------------------------
+ *
+ * "Open"	Normal state, no dubious events, fast path.
+ * "Disorder"   In all the respects it is "Open",
+ *		but requires a bit more attention. It is entered when
+ *		we see some SACKs or dupacks. It is split of "Open"
+ *		mainly to move some processing from fast path to slow one.
+ * "CWR"	CWND was reduced due to some Congestion Notification event.
+ *		It can be ECN, ICMP source quench, local device congestion.
+ * "Recovery"	CWND was reduced, we are fast-retransmitting.
+ * "Loss"	CWND was reduced due to RTO timeout or SACK reneging.
+ *
+ * tcp_fastretrans_alert() is entered:
+ * - each incoming ACK, if state is not "Open"
+ * - when arrived ACK is unusual, namely:
+ *	* SACK
+ *	* Duplicate ACK.
+ *	* ECN ECE.
+ *
+ * Counting packets in flight is pretty simple.
+ *
+ *	in_flight = packets_out - left_out + retrans_out
+ *
+ *	packets_out is SND.NXT-SND.UNA counted in packets.
+ *
+ *	retrans_out is number of retransmitted segments.
+ *
+ *	left_out is number of segments left network, but not ACKed yet.
+ *
+ *		left_out = sacked_out + lost_out
+ *
+ *     sacked_out: Packets, which arrived to receiver out of order
+ *		   and hence not ACKed. With SACKs this number is simply
+ *		   amount of SACKed data. Even without SACKs
+ *		   it is easy to give pretty reliable estimate of this number,
+ *		   counting duplicate ACKs.
+ *
+ *       lost_out: Packets lost by network. TCP has no explicit
+ *		   "loss notification" feedback from network (for now).
+ *		   It means that this number can be only _guessed_.
+ *		   Actually, it is the heuristics to predict lossage that
+ *		   distinguishes different algorithms.
+ *
+ *	F.e. after RTO, when all the queue is considered as lost,
+ *	lost_out = packets_out and in_flight = retrans_out.
+ *
+ *		Essentially, we have now two algorithms counting
+ *		lost packets.
+ *
+ *		FACK: It is the simplest heuristics. As soon as we decided
+ *		that something is lost, we decide that _all_ not SACKed
+ *		packets until the most forward SACK are lost. I.e.
+ *		lost_out = fackets_out - sacked_out and left_out = fackets_out.
+ *		It is absolutely correct estimate, if network does not reorder
+ *		packets. And it loses any connection to reality when reordering
+ *		takes place. We use FACK by default until reordering
+ *		is suspected on the path to this destination.
+ *
+ *		NewReno: when Recovery is entered, we assume that one segment
+ *		is lost (classic Reno). While we are in Recovery and
+ *		a partial ACK arrives, we assume that one more packet
+ *		is lost (NewReno). This heuristics are the same in NewReno
+ *		and SACK.
+ *
+ *  Imagine, that's all! Forget about all this shamanism about CWND inflation
+ *  deflation etc. CWND is real congestion window, never inflated, changes
+ *  only according to classic VJ rules.
+ *
+ * Really tricky (and requiring careful tuning) part of algorithm
+ * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
+ * The first determines the moment _when_ we should reduce CWND and,
+ * hence, slow down forward transmission. In fact, it determines the moment
+ * when we decide that hole is caused by loss, rather than by a reorder.
+ *
+ * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
+ * holes, caused by lost packets.
+ *
+ * And the most logically complicated part of algorithm is undo
+ * heuristics. We detect false retransmits due to both too early
+ * fast retransmit (reordering) and underestimated RTO, analyzing
+ * timestamps and D-SACKs. When we detect that some segments were
+ * retransmitted by mistake and CWND reduction was wrong, we undo
+ * window reduction and abort recovery phase. This logic is hidden
+ * inside several functions named tcp_try_undo_<something>.
+ */
+
+/* This function decides, when we should leave Disordered state
+ * and enter Recovery phase, reducing congestion window.
+ *
+ * Main question: may we further continue forward transmission
+ * with the same cwnd?
+ */
+static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp)
+{
+	__u32 packets_out;
+
+	/* Trick#1: The loss is proven. */
+	if (tp->lost_out)
+		return 1;
+
+	/* Not-A-Trick#2 : Classic rule... */
+	if (tcp_fackets_out(tp) > tp->reordering)
+		return 1;
+
+	/* Trick#3 : when we use RFC2988 timer restart, fast
+	 * retransmit can be triggered by timeout of queue head.
+	 */
+	if (tcp_head_timedout(sk, tp))
+		return 1;
+
+	/* Trick#4: It is still not OK... But will it be useful to delay
+	 * recovery more?
+	 */
+	packets_out = tp->packets_out;
+	if (packets_out <= tp->reordering &&
+	    tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+	    !tcp_may_send_now(sk, tp)) {
+		/* We have nothing to send. This connection is limited
+		 * either by receiver window or by application.
+		 */
+		return 1;
+	}
+
+	return 0;
+}
+
+/* If we receive more dupacks than we expected counting segments
+ * in assumption of absent reordering, interpret this as reordering.
+ * The only another reason could be bug in receiver TCP.
+ */
+static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend)
+{
+	u32 holes;
+
+	holes = max(tp->lost_out, 1U);
+	holes = min(holes, tp->packets_out);
+
+	if ((tp->sacked_out + holes) > tp->packets_out) {
+		tp->sacked_out = tp->packets_out - holes;
+		tcp_update_reordering(tp, tp->packets_out+addend, 0);
+	}
+}
+
+/* Emulate SACKs for SACKless connection: account for a new dupack. */
+
+static void tcp_add_reno_sack(struct tcp_sock *tp)
+{
+	tp->sacked_out++;
+	tcp_check_reno_reordering(tp, 0);
+	tcp_sync_left_out(tp);
+}
+
+/* Account for ACK, ACKing some data in Reno Recovery phase. */
+
+static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked)
+{
+	if (acked > 0) {
+		/* One ACK acked hole. The rest eat duplicate ACKs. */
+		if (acked-1 >= tp->sacked_out)
+			tp->sacked_out = 0;
+		else
+			tp->sacked_out -= acked-1;
+	}
+	tcp_check_reno_reordering(tp, acked);
+	tcp_sync_left_out(tp);
+}
+
+static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
+{
+	tp->sacked_out = 0;
+	tp->left_out = tp->lost_out;
+}
+
+/* Mark head of queue up as lost. */
+static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
+			       int packets, u32 high_seq)
+{
+	struct sk_buff *skb;
+	int cnt = packets;
+
+	BUG_TRAP(cnt <= tp->packets_out);
+
+	sk_stream_for_retrans_queue(skb, sk) {
+		cnt -= tcp_skb_pcount(skb);
+		if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+			break;
+		if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+			TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+			tp->lost_out += tcp_skb_pcount(skb);
+		}
+	}
+	tcp_sync_left_out(tp);
+}
+
+/* Account newly detected lost packet(s) */
+
+static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+{
+	if (IsFack(tp)) {
+		int lost = tp->fackets_out - tp->reordering;
+		if (lost <= 0)
+			lost = 1;
+		tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
+	} else {
+		tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
+	}
+
+	/* New heuristics: it is possible only after we switched
+	 * to restart timer each time when something is ACKed.
+	 * Hence, we can detect timed out packets during fast
+	 * retransmit without falling to slow start.
+	 */
+	if (tcp_head_timedout(sk, tp)) {
+		struct sk_buff *skb;
+
+		sk_stream_for_retrans_queue(skb, sk) {
+			if (tcp_skb_timedout(tp, skb) &&
+			    !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+				tp->lost_out += tcp_skb_pcount(skb);
+			}
+		}
+		tcp_sync_left_out(tp);
+	}
+}
+
+/* CWND moderation, preventing bursts due to too big ACKs
+ * in dubious situations.
+ */
+static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
+{
+	tp->snd_cwnd = min(tp->snd_cwnd,
+			   tcp_packets_in_flight(tp)+tcp_max_burst(tp));
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Decrease cwnd each second ack. */
+
+static void tcp_cwnd_down(struct tcp_sock *tp)
+{
+	int decr = tp->snd_cwnd_cnt + 1;
+	__u32 limit;
+
+	/*
+	 * TCP Westwood
+	 * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
+	 * in packets we use mss_cache). If sysctl_tcp_westwood is off
+	 * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
+	 * still used as usual. It prevents other strange cases in which
+	 * BWE*RTTmin could assume value 0. It should not happen but...
+	 */
+
+	if (!(limit = tcp_westwood_bw_rttmin(tp)))
+		limit = tp->snd_ssthresh/2;
+
+	tp->snd_cwnd_cnt = decr&1;
+	decr >>= 1;
+
+	if (decr && tp->snd_cwnd > limit)
+		tp->snd_cwnd -= decr;
+
+	tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* Nothing was retransmitted or returned timestamp is less
+ * than timestamp of the first retransmission.
+ */
+static inline int tcp_packet_delayed(struct tcp_sock *tp)
+{
+	return !tp->retrans_stamp ||
+		(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+		 (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
+}
+
+/* Undo procedures. */
+
+#if FASTRETRANS_DEBUG > 1
+static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
+	       msg,
+	       NIPQUAD(inet->daddr), ntohs(inet->dport),
+	       tp->snd_cwnd, tp->left_out,
+	       tp->snd_ssthresh, tp->prior_ssthresh,
+	       tp->packets_out);
+}
+#else
+#define DBGUNDO(x...) do { } while (0)
+#endif
+
+static void tcp_undo_cwr(struct tcp_sock *tp, int undo)
+{
+	if (tp->prior_ssthresh) {
+		if (tcp_is_bic(tp))
+			tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd);
+		else
+			tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);
+
+		if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
+			tp->snd_ssthresh = tp->prior_ssthresh;
+			TCP_ECN_withdraw_cwr(tp);
+		}
+	} else {
+		tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
+	}
+	tcp_moderate_cwnd(tp);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static inline int tcp_may_undo(struct tcp_sock *tp)
+{
+	return tp->undo_marker &&
+		(!tp->undo_retrans || tcp_packet_delayed(tp));
+}
+
+/* People celebrate: "We love our President!" */
+static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp)
+{
+	if (tcp_may_undo(tp)) {
+		/* Happy end! We did not retransmit anything
+		 * or our original transmission succeeded.
+		 */
+		DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
+		tcp_undo_cwr(tp, 1);
+		if (tp->ca_state == TCP_CA_Loss)
+			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
+		else
+			NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
+		tp->undo_marker = 0;
+	}
+	if (tp->snd_una == tp->high_seq && IsReno(tp)) {
+		/* Hold old state until something *above* high_seq
+		 * is ACKed. For Reno it is MUST to prevent false
+		 * fast retransmits (RFC2582). SACK TCP is safe. */
+		tcp_moderate_cwnd(tp);
+		return 1;
+	}
+	tcp_set_ca_state(tp, TCP_CA_Open);
+	return 0;
+}
+
+/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
+static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp)
+{
+	if (tp->undo_marker && !tp->undo_retrans) {
+		DBGUNDO(sk, tp, "D-SACK");
+		tcp_undo_cwr(tp, 1);
+		tp->undo_marker = 0;
+		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
+	}
+}
+
+/* Undo during fast recovery after partial ACK. */
+
+static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp,
+				int acked)
+{
+	/* Partial ACK arrived. Force Hoe's retransmit. */
+	int failed = IsReno(tp) || tp->fackets_out>tp->reordering;
+
+	if (tcp_may_undo(tp)) {
+		/* Plain luck! Hole if filled with delayed
+		 * packet, rather than with a retransmit.
+		 */
+		if (tp->retrans_out == 0)
+			tp->retrans_stamp = 0;
+
+		tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1);
+
+		DBGUNDO(sk, tp, "Hoe");
+		tcp_undo_cwr(tp, 0);
+		NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);
+
+		/* So... Do not make Hoe's retransmit yet.
+		 * If the first packet was delayed, the rest
+		 * ones are most probably delayed as well.
+		 */
+		failed = 0;
+	}
+	return failed;
+}
+
+/* Undo during loss recovery after partial ACK. */
+static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp)
+{
+	if (tcp_may_undo(tp)) {
+		struct sk_buff *skb;
+		sk_stream_for_retrans_queue(skb, sk) {
+			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+		}
+		DBGUNDO(sk, tp, "partial loss");
+		tp->lost_out = 0;
+		tp->left_out = tp->sacked_out;
+		tcp_undo_cwr(tp, 1);
+		NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
+		tp->retransmits = 0;
+		tp->undo_marker = 0;
+		if (!IsReno(tp))
+			tcp_set_ca_state(tp, TCP_CA_Open);
+		return 1;
+	}
+	return 0;
+}
+
+static inline void tcp_complete_cwr(struct tcp_sock *tp)
+{
+	if (tcp_westwood_cwnd(tp)) 
+		tp->snd_ssthresh = tp->snd_cwnd;
+	else
+		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag)
+{
+	tp->left_out = tp->sacked_out;
+
+	if (tp->retrans_out == 0)
+		tp->retrans_stamp = 0;
+
+	if (flag&FLAG_ECE)
+		tcp_enter_cwr(tp);
+
+	if (tp->ca_state != TCP_CA_CWR) {
+		int state = TCP_CA_Open;
+
+		if (tp->left_out || tp->retrans_out || tp->undo_marker)
+			state = TCP_CA_Disorder;
+
+		if (tp->ca_state != state) {
+			tcp_set_ca_state(tp, state);
+			tp->high_seq = tp->snd_nxt;
+		}
+		tcp_moderate_cwnd(tp);
+	} else {
+		tcp_cwnd_down(tp);
+	}
+}
+
+/* Process an event, which can update packets-in-flight not trivially.
+ * Main goal of this function is to calculate new estimate for left_out,
+ * taking into account both packets sitting in receiver's buffer and
+ * packets lost by network.
+ *
+ * Besides that it does CWND reduction, when packet loss is detected
+ * and changes state of machine.
+ *
+ * It does _not_ decide what to send, it is made in function
+ * tcp_xmit_retransmit_queue().
+ */
+static void
+tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
+		      int prior_packets, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP));
+
+	/* Some technical things:
+	 * 1. Reno does not count dupacks (sacked_out) automatically. */
+	if (!tp->packets_out)
+		tp->sacked_out = 0;
+        /* 2. SACK counts snd_fack in packets inaccurately. */
+	if (tp->sacked_out == 0)
+		tp->fackets_out = 0;
+
+        /* Now state machine starts.
+	 * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
+	if (flag&FLAG_ECE)
+		tp->prior_ssthresh = 0;
+
+	/* B. In all the states check for reneging SACKs. */
+	if (tp->sacked_out && tcp_check_sack_reneging(sk, tp))
+		return;
+
+	/* C. Process data loss notification, provided it is valid. */
+	if ((flag&FLAG_DATA_LOST) &&
+	    before(tp->snd_una, tp->high_seq) &&
+	    tp->ca_state != TCP_CA_Open &&
+	    tp->fackets_out > tp->reordering) {
+		tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
+		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
+	}
+
+	/* D. Synchronize left_out to current state. */
+	tcp_sync_left_out(tp);
+
+	/* E. Check state exit conditions. State can be terminated
+	 *    when high_seq is ACKed. */
+	if (tp->ca_state == TCP_CA_Open) {
+		if (!sysctl_tcp_frto)
+			BUG_TRAP(tp->retrans_out == 0);
+		tp->retrans_stamp = 0;
+	} else if (!before(tp->snd_una, tp->high_seq)) {
+		switch (tp->ca_state) {
+		case TCP_CA_Loss:
+			tp->retransmits = 0;
+			if (tcp_try_undo_recovery(sk, tp))
+				return;
+			break;
+
+		case TCP_CA_CWR:
+			/* CWR is to be held something *above* high_seq
+			 * is ACKed for CWR bit to reach receiver. */
+			if (tp->snd_una != tp->high_seq) {
+				tcp_complete_cwr(tp);
+				tcp_set_ca_state(tp, TCP_CA_Open);
+			}
+			break;
+
+		case TCP_CA_Disorder:
+			tcp_try_undo_dsack(sk, tp);
+			if (!tp->undo_marker ||
+			    /* For SACK case do not Open to allow to undo
+			     * catching for all duplicate ACKs. */
+			    IsReno(tp) || tp->snd_una != tp->high_seq) {
+				tp->undo_marker = 0;
+				tcp_set_ca_state(tp, TCP_CA_Open);
+			}
+			break;
+
+		case TCP_CA_Recovery:
+			if (IsReno(tp))
+				tcp_reset_reno_sack(tp);
+			if (tcp_try_undo_recovery(sk, tp))
+				return;
+			tcp_complete_cwr(tp);
+			break;
+		}
+	}
+
+	/* F. Process state. */
+	switch (tp->ca_state) {
+	case TCP_CA_Recovery:
+		if (prior_snd_una == tp->snd_una) {
+			if (IsReno(tp) && is_dupack)
+				tcp_add_reno_sack(tp);
+		} else {
+			int acked = prior_packets - tp->packets_out;
+			if (IsReno(tp))
+				tcp_remove_reno_sacks(sk, tp, acked);
+			is_dupack = tcp_try_undo_partial(sk, tp, acked);
+		}
+		break;
+	case TCP_CA_Loss:
+		if (flag&FLAG_DATA_ACKED)
+			tp->retransmits = 0;
+		if (!tcp_try_undo_loss(sk, tp)) {
+			tcp_moderate_cwnd(tp);
+			tcp_xmit_retransmit_queue(sk);
+			return;
+		}
+		if (tp->ca_state != TCP_CA_Open)
+			return;
+		/* Loss is undone; fall through to processing in Open state. */
+	default:
+		if (IsReno(tp)) {
+			if (tp->snd_una != prior_snd_una)
+				tcp_reset_reno_sack(tp);
+			if (is_dupack)
+				tcp_add_reno_sack(tp);
+		}
+
+		if (tp->ca_state == TCP_CA_Disorder)
+			tcp_try_undo_dsack(sk, tp);
+
+		if (!tcp_time_to_recover(sk, tp)) {
+			tcp_try_to_open(sk, tp, flag);
+			return;
+		}
+
+		/* Otherwise enter Recovery state */
+
+		if (IsReno(tp))
+			NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
+		else
+			NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);
+
+		tp->high_seq = tp->snd_nxt;
+		tp->prior_ssthresh = 0;
+		tp->undo_marker = tp->snd_una;
+		tp->undo_retrans = tp->retrans_out;
+
+		if (tp->ca_state < TCP_CA_CWR) {
+			if (!(flag&FLAG_ECE))
+				tp->prior_ssthresh = tcp_current_ssthresh(tp);
+			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
+			TCP_ECN_queue_cwr(tp);
+		}
+
+		tp->snd_cwnd_cnt = 0;
+		tcp_set_ca_state(tp, TCP_CA_Recovery);
+	}
+
+	if (is_dupack || tcp_head_timedout(sk, tp))
+		tcp_update_scoreboard(sk, tp);
+	tcp_cwnd_down(tp);
+	tcp_xmit_retransmit_queue(sk);
+}
+
+/* Read draft-ietf-tcplw-high-performance before mucking
+ * with this code. (Superceeds RFC1323)
+ */
+static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag)
+{
+	__u32 seq_rtt;
+
+	/* RTTM Rule: A TSecr value received in a segment is used to
+	 * update the averaged RTT measurement only if the segment
+	 * acknowledges some new data, i.e., only if it advances the
+	 * left edge of the send window.
+	 *
+	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
+	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
+	 *
+	 * Changed: reset backoff as soon as we see the first valid sample.
+	 * If we do not, we get strongly overstimated rto. With timestamps
+	 * samples are accepted even from very old segments: f.e., when rtt=1
+	 * increases to 8, we retransmit 5 times and after 8 seconds delayed
+	 * answer arrives rto becomes 120 seconds! If at least one of segments
+	 * in window is lost... Voila.	 			--ANK (010210)
+	 */
+	seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
+	tcp_rtt_estimator(tp, seq_rtt);
+	tcp_set_rto(tp);
+	tp->backoff = 0;
+	tcp_bound_rto(tp);
+}
+
+static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag)
+{
+	/* We don't have a timestamp. Can only use
+	 * packets that are not retransmitted to determine
+	 * rtt estimates. Also, we must not reset the
+	 * backoff for rto until we get a non-retransmitted
+	 * packet. This allows us to deal with a situation
+	 * where the network delay has increased suddenly.
+	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
+	 */
+
+	if (flag & FLAG_RETRANS_DATA_ACKED)
+		return;
+
+	tcp_rtt_estimator(tp, seq_rtt);
+	tcp_set_rto(tp);
+	tp->backoff = 0;
+	tcp_bound_rto(tp);
+}
+
+static inline void tcp_ack_update_rtt(struct tcp_sock *tp,
+				      int flag, s32 seq_rtt)
+{
+	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
+	if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
+		tcp_ack_saw_tstamp(tp, flag);
+	else if (seq_rtt >= 0)
+		tcp_ack_no_tstamp(tp, seq_rtt, flag);
+}
+
+/*
+ * Compute congestion window to use.
+ *
+ * This is from the implementation of BICTCP in
+ * Lison-Xu, Kahaled Harfoush, and Injog Rhee.
+ *  "Binary Increase Congestion Control for Fast, Long Distance
+ *  Networks" in InfoComm 2004
+ * Available from:
+ *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
+ *
+ * Unless BIC is enabled and congestion window is large
+ * this behaves the same as the original Reno.
+ */
+static inline __u32 bictcp_cwnd(struct tcp_sock *tp)
+{
+	/* orignal Reno behaviour */
+	if (!tcp_is_bic(tp))
+		return tp->snd_cwnd;
+
+	if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
+	   (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
+		return tp->bictcp.cnt;
+
+	tp->bictcp.last_cwnd = tp->snd_cwnd;
+	tp->bictcp.last_stamp = tcp_time_stamp;
+      
+	/* start off normal */
+	if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
+		tp->bictcp.cnt = tp->snd_cwnd;
+
+	/* binary increase */
+	else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
+		__u32 	dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
+			/ BICTCP_B;
+
+		if (dist > BICTCP_MAX_INCREMENT)
+			/* linear increase */
+			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
+		else if (dist <= 1U)
+			/* binary search increase */
+			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
+				/ BICTCP_B;
+		else
+			/* binary search increase */
+			tp->bictcp.cnt = tp->snd_cwnd / dist;
+	} else {
+		/* slow start amd linear increase */
+		if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
+			/* slow start */
+			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
+				/ BICTCP_B;
+		else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
+			 		+ BICTCP_MAX_INCREMENT*(BICTCP_B-1))
+			/* slow start */
+			tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
+				/ (tp->snd_cwnd-tp->bictcp.last_max_cwnd);
+		else
+			/* linear increase */
+			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
+	}
+	return tp->bictcp.cnt;
+}
+
+/* This is Jacobson's slow start and congestion avoidance. 
+ * SIGCOMM '88, p. 328.
+ */
+static inline void reno_cong_avoid(struct tcp_sock *tp)
+{
+        if (tp->snd_cwnd <= tp->snd_ssthresh) {
+                /* In "safe" area, increase. */
+		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+			tp->snd_cwnd++;
+	} else {
+                /* In dangerous area, increase slowly.
+		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
+		 */
+		if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
+			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
+				tp->snd_cwnd++;
+			tp->snd_cwnd_cnt=0;
+		} else
+			tp->snd_cwnd_cnt++;
+        }
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+/* This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ */
+static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
+{
+	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+	 * bytes of data have been ACKed during the course of the RTT, giving
+	 * an "actual" rate of:
+	 *
+	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+	 */
+
+	if (after(ack, tp->vegas.beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+		u32 old_wnd, old_snd_cwnd;
+
+		
+		/* Here old_wnd is essentially the window of data that was
+		 * sent during the previous RTT, and has all
+		 * been acknowledged in the course of the RTT that ended
+		 * with the ACK we just received. Likewise, old_snd_cwnd
+		 * is the cwnd during the previous RTT.
+		 */
+		old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
+			tp->mss_cache_std;
+		old_snd_cwnd = tp->vegas.beg_snd_cwnd;
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		tp->vegas.beg_snd_una  = tp->vegas.beg_snd_nxt;
+		tp->vegas.beg_snd_nxt  = tp->snd_nxt;
+		tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
+
+		/* Take into account the current RTT sample too, to
+		 * decrease the impact of delayed acks. This double counts
+		 * this sample since we count it for the next window as well,
+		 * but that's not too awful, since we're taking the min,
+		 * rather than averaging.
+		 */
+		vegas_rtt_calc(tp, seq_rtt);
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If  we have 3 samples, we should be OK.
+		 */
+
+		if (tp->vegas.cntRTT <= 2) {
+			/* We don't have enough RTT samples to do the Vegas
+			 * calculation, so we'll behave like Reno.
+			 */
+			if (tp->snd_cwnd > tp->snd_ssthresh)
+				tp->snd_cwnd++;
+		} else {
+			u32 rtt, target_cwnd, diff;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = tp->vegas.minRTT;
+
+			/* Calculate the cwnd we should have, if we weren't
+			 * going too fast.
+			 *
+			 * This is:
+			 *     (actual rate in segments) * baseRTT
+			 * We keep it as a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary point.
+			 */
+			target_cwnd = ((old_wnd * tp->vegas.baseRTT)
+				       << V_PARAM_SHIFT) / rtt;
+
+			/* Calculate the difference between the window we had,
+			 * and the window we would like to have. This quantity
+			 * is the "Diff" from the Arizona Vegas papers.
+			 *
+			 * Again, this is a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary
+			 * point.
+			 */
+			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+
+			if (tp->snd_cwnd < tp->snd_ssthresh) {
+				/* Slow start.  */
+				if (diff > sysctl_tcp_vegas_gamma) {
+					/* Going too fast. Time to slow down
+					 * and switch to congestion avoidance.
+					 */
+					tp->snd_ssthresh = 2;
+
+					/* Set cwnd to match the actual rate
+					 * exactly:
+					 *   cwnd = (actual rate) * baseRTT
+					 * Then we add 1 because the integer
+					 * truncation robs us of full link
+					 * utilization.
+					 */
+					tp->snd_cwnd = min(tp->snd_cwnd,
+							   (target_cwnd >>
+							    V_PARAM_SHIFT)+1);
+
+				}
+			} else {
+				/* Congestion avoidance. */
+				u32 next_snd_cwnd;
+
+				/* Figure out where we would like cwnd
+				 * to be.
+				 */
+				if (diff > sysctl_tcp_vegas_beta) {
+					/* The old window was too fast, so
+					 * we slow down.
+					 */
+					next_snd_cwnd = old_snd_cwnd - 1;
+				} else if (diff < sysctl_tcp_vegas_alpha) {
+					/* We don't have enough extra packets
+					 * in the network, so speed up.
+					 */
+					next_snd_cwnd = old_snd_cwnd + 1;
+				} else {
+					/* Sending just as fast as we
+					 * should be.
+					 */
+					next_snd_cwnd = old_snd_cwnd;
+				}
+
+				/* Adjust cwnd upward or downward, toward the
+				 * desired value.
+				 */
+				if (next_snd_cwnd > tp->snd_cwnd)
+					tp->snd_cwnd++;
+				else if (next_snd_cwnd < tp->snd_cwnd)
+					tp->snd_cwnd--;
+			}
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		tp->vegas.cntRTT = 0;
+		tp->vegas.minRTT = 0x7fffffff;
+	}
+
+	/* The following code is executed for every ack we receive,
+	 * except for conditions checked in should_advance_cwnd()
+	 * before the call to tcp_cong_avoid(). Mainly this means that
+	 * we only execute this code if the ack actually acked some
+	 * data.
+	 */
+
+	/* If we are in slow start, increase our cwnd in response to this ACK.
+	 * (If we are not in slow start then we are in congestion avoidance,
+	 * and adjust our congestion window only once per RTT. See the code
+	 * above.)
+	 */
+	if (tp->snd_cwnd <= tp->snd_ssthresh) 
+		tp->snd_cwnd++;
+
+	/* to keep cwnd from growing without bound */
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+
+	/* Make sure that we are never so timid as to reduce our cwnd below
+	 * 2 MSS.
+	 *
+	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+	 */
+	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt)
+{
+	if (tcp_vegas_enabled(tp))
+		vegas_cong_avoid(tp, ack, seq_rtt);
+	else
+		reno_cong_avoid(tp);
+}
+
+/* Restart timer after forward progress on connection.
+ * RFC2988 recommends to restart timer to now+rto.
+ */
+
+static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp)
+{
+	if (!tp->packets_out) {
+		tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS);
+	} else {
+		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+	}
+}
+
+/* There is one downside to this scheme.  Although we keep the
+ * ACK clock ticking, adjusting packet counters and advancing
+ * congestion window, we do not liberate socket send buffer
+ * space.
+ *
+ * Mucking with skb->truesize and sk->sk_wmem_alloc et al.
+ * then making a write space wakeup callback is a possible
+ * future enhancement.  WARNING: it is not trivial to make.
+ */
+static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
+			 __u32 now, __s32 *seq_rtt)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 
+	__u32 seq = tp->snd_una;
+	__u32 packets_acked;
+	int acked = 0;
+
+	/* If we get here, the whole TSO packet has not been
+	 * acked.
+	 */
+	BUG_ON(!after(scb->end_seq, seq));
+
+	packets_acked = tcp_skb_pcount(skb);
+	if (tcp_trim_head(sk, skb, seq - scb->seq))
+		return 0;
+	packets_acked -= tcp_skb_pcount(skb);
+
+	if (packets_acked) {
+		__u8 sacked = scb->sacked;
+
+		acked |= FLAG_DATA_ACKED;
+		if (sacked) {
+			if (sacked & TCPCB_RETRANS) {
+				if (sacked & TCPCB_SACKED_RETRANS)
+					tp->retrans_out -= packets_acked;
+				acked |= FLAG_RETRANS_DATA_ACKED;
+				*seq_rtt = -1;
+			} else if (*seq_rtt < 0)
+				*seq_rtt = now - scb->when;
+			if (sacked & TCPCB_SACKED_ACKED)
+				tp->sacked_out -= packets_acked;
+			if (sacked & TCPCB_LOST)
+				tp->lost_out -= packets_acked;
+			if (sacked & TCPCB_URG) {
+				if (tp->urg_mode &&
+				    !before(seq, tp->snd_up))
+					tp->urg_mode = 0;
+			}
+		} else if (*seq_rtt < 0)
+			*seq_rtt = now - scb->when;
+
+		if (tp->fackets_out) {
+			__u32 dval = min(tp->fackets_out, packets_acked);
+			tp->fackets_out -= dval;
+		}
+		tp->packets_out -= packets_acked;
+
+		BUG_ON(tcp_skb_pcount(skb) == 0);
+		BUG_ON(!before(scb->seq, scb->end_seq));
+	}
+
+	return acked;
+}
+
+
+/* Remove acknowledged frames from the retransmission queue. */
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	__u32 now = tcp_time_stamp;
+	int acked = 0;
+	__s32 seq_rtt = -1;
+
+	while ((skb = skb_peek(&sk->sk_write_queue)) &&
+	       skb != sk->sk_send_head) {
+		struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 
+		__u8 sacked = scb->sacked;
+
+		/* If our packet is before the ack sequence we can
+		 * discard it as it's confirmed to have arrived at
+		 * the other end.
+		 */
+		if (after(scb->end_seq, tp->snd_una)) {
+			if (tcp_skb_pcount(skb) > 1)
+				acked |= tcp_tso_acked(sk, skb,
+						       now, &seq_rtt);
+			break;
+		}
+
+		/* Initial outgoing SYN's get put onto the write_queue
+		 * just like anything else we transmit.  It is not
+		 * true data, and if we misinform our callers that
+		 * this ACK acks real data, we will erroneously exit
+		 * connection startup slow start one packet too
+		 * quickly.  This is severely frowned upon behavior.
+		 */
+		if (!(scb->flags & TCPCB_FLAG_SYN)) {
+			acked |= FLAG_DATA_ACKED;
+		} else {
+			acked |= FLAG_SYN_ACKED;
+			tp->retrans_stamp = 0;
+		}
+
+		if (sacked) {
+			if (sacked & TCPCB_RETRANS) {
+				if(sacked & TCPCB_SACKED_RETRANS)
+					tp->retrans_out -= tcp_skb_pcount(skb);
+				acked |= FLAG_RETRANS_DATA_ACKED;
+				seq_rtt = -1;
+			} else if (seq_rtt < 0)
+				seq_rtt = now - scb->when;
+			if (sacked & TCPCB_SACKED_ACKED)
+				tp->sacked_out -= tcp_skb_pcount(skb);
+			if (sacked & TCPCB_LOST)
+				tp->lost_out -= tcp_skb_pcount(skb);
+			if (sacked & TCPCB_URG) {
+				if (tp->urg_mode &&
+				    !before(scb->end_seq, tp->snd_up))
+					tp->urg_mode = 0;
+			}
+		} else if (seq_rtt < 0)
+			seq_rtt = now - scb->when;
+		tcp_dec_pcount_approx(&tp->fackets_out, skb);
+		tcp_packets_out_dec(tp, skb);
+		__skb_unlink(skb, skb->list);
+		sk_stream_free_skb(sk, skb);
+	}
+
+	if (acked&FLAG_ACKED) {
+		tcp_ack_update_rtt(tp, acked, seq_rtt);
+		tcp_ack_packets_out(sk, tp);
+	}
+
+#if FASTRETRANS_DEBUG > 0
+	BUG_TRAP((int)tp->sacked_out >= 0);
+	BUG_TRAP((int)tp->lost_out >= 0);
+	BUG_TRAP((int)tp->retrans_out >= 0);
+	if (!tp->packets_out && tp->rx_opt.sack_ok) {
+		if (tp->lost_out) {
+			printk(KERN_DEBUG "Leak l=%u %d\n",
+			       tp->lost_out, tp->ca_state);
+			tp->lost_out = 0;
+		}
+		if (tp->sacked_out) {
+			printk(KERN_DEBUG "Leak s=%u %d\n",
+			       tp->sacked_out, tp->ca_state);
+			tp->sacked_out = 0;
+		}
+		if (tp->retrans_out) {
+			printk(KERN_DEBUG "Leak r=%u %d\n",
+			       tp->retrans_out, tp->ca_state);
+			tp->retrans_out = 0;
+		}
+	}
+#endif
+	*seq_rtt_p = seq_rtt;
+	return acked;
+}
+
+static void tcp_ack_probe(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Was it a usable window open? */
+
+	if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq,
+		   tp->snd_una + tp->snd_wnd)) {
+		tp->backoff = 0;
+		tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0);
+		/* Socket must be waked up by subsequent tcp_data_snd_check().
+		 * This function is not for random using!
+		 */
+	} else {
+		tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0,
+				     min(tp->rto << tp->backoff, TCP_RTO_MAX));
+	}
+}
+
+static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag)
+{
+	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+		tp->ca_state != TCP_CA_Open);
+}
+
+static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag)
+{
+	return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) &&
+		!((1<<tp->ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR));
+}
+
+/* Check that window update is acceptable.
+ * The function assumes that snd_una<=ack<=snd_next.
+ */
+static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack,
+					u32 ack_seq, u32 nwin)
+{
+	return (after(ack, tp->snd_una) ||
+		after(ack_seq, tp->snd_wl1) ||
+		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+}
+
+/* Update our send window.
+ *
+ * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2
+ * and in FreeBSD. NetBSD's one is even worse.) is wrong.
+ */
+static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp,
+				 struct sk_buff *skb, u32 ack, u32 ack_seq)
+{
+	int flag = 0;
+	u32 nwin = ntohs(skb->h.th->window);
+
+	if (likely(!skb->h.th->syn))
+		nwin <<= tp->rx_opt.snd_wscale;
+
+	if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
+		flag |= FLAG_WIN_UPDATE;
+		tcp_update_wl(tp, ack, ack_seq);
+
+		if (tp->snd_wnd != nwin) {
+			tp->snd_wnd = nwin;
+
+			/* Note, it is the only place, where
+			 * fast path is recovered for sending TCP.
+			 */
+			tcp_fast_path_check(sk, tp);
+
+			if (nwin > tp->max_window) {
+				tp->max_window = nwin;
+				tcp_sync_mss(sk, tp->pmtu_cookie);
+			}
+		}
+	}
+
+	tp->snd_una = ack;
+
+	return flag;
+}
+
+static void tcp_process_frto(struct sock *sk, u32 prior_snd_una)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	
+	tcp_sync_left_out(tp);
+	
+	if (tp->snd_una == prior_snd_una ||
+	    !before(tp->snd_una, tp->frto_highmark)) {
+		/* RTO was caused by loss, start retransmitting in
+		 * go-back-N slow start
+		 */
+		tcp_enter_frto_loss(sk);
+		return;
+	}
+
+	if (tp->frto_counter == 1) {
+		/* First ACK after RTO advances the window: allow two new
+		 * segments out.
+		 */
+		tp->snd_cwnd = tcp_packets_in_flight(tp) + 2;
+	} else {
+		/* Also the second ACK after RTO advances the window.
+		 * The RTO was likely spurious. Reduce cwnd and continue
+		 * in congestion avoidance
+		 */
+		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+		tcp_moderate_cwnd(tp);
+	}
+
+	/* F-RTO affects on two new ACKs following RTO.
+	 * At latest on third ACK the TCP behavor is back to normal.
+	 */
+	tp->frto_counter = (tp->frto_counter + 1) % 3;
+}
+
+/*
+ * TCP Westwood+
+ */
+
+/*
+ * @init_westwood
+ * This function initializes fields used in TCP Westwood+. We can't
+ * get no information about RTTmin at this time so we simply set it to
+ * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative
+ * since in this way we're sure it will be updated in a consistent
+ * way as soon as possible. It will reasonably happen within the first
+ * RTT period of the connection lifetime.
+ */
+
+static void init_westwood(struct sock *sk)
+{
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        tp->westwood.bw_ns_est = 0;
+        tp->westwood.bw_est = 0;
+        tp->westwood.accounted = 0;
+        tp->westwood.cumul_ack = 0;
+        tp->westwood.rtt_win_sx = tcp_time_stamp;
+        tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT;
+        tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT;
+        tp->westwood.snd_una = tp->snd_una;
+}
+
+/*
+ * @westwood_do_filter
+ * Low-pass filter. Implemented using constant coeffients.
+ */
+
+static inline __u32 westwood_do_filter(__u32 a, __u32 b)
+{
+	return (((7 * a) + b) >> 3);
+}
+
+static void westwood_filter(struct sock *sk, __u32 delta)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->westwood.bw_ns_est =
+		westwood_do_filter(tp->westwood.bw_ns_est, 
+				   tp->westwood.bk / delta);
+	tp->westwood.bw_est =
+		westwood_do_filter(tp->westwood.bw_est,
+				   tp->westwood.bw_ns_est);
+}
+
+/* 
+ * @westwood_update_rttmin
+ * It is used to update RTTmin. In this case we MUST NOT use
+ * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN!
+ */
+
+static inline __u32 westwood_update_rttmin(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	__u32 rttmin = tp->westwood.rtt_min;
+
+	if (tp->westwood.rtt != 0 &&
+	    (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin))
+		rttmin = tp->westwood.rtt;
+
+	return rttmin;
+}
+
+/*
+ * @westwood_acked
+ * Evaluate increases for dk. 
+ */
+
+static inline __u32 westwood_acked(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+
+	return tp->snd_una - tp->westwood.snd_una;
+}
+
+/*
+ * @westwood_new_window
+ * It evaluates if we are receiving data inside the same RTT window as
+ * when we started.
+ * Return value:
+ * It returns 0 if we are still evaluating samples in the same RTT
+ * window, 1 if the sample has to be considered in the next window.
+ */
+
+static int westwood_new_window(const struct sock *sk)
+{
+	const struct tcp_sock *tp = tcp_sk(sk);
+	__u32 left_bound;
+	__u32 rtt;
+	int ret = 0;
+
+	left_bound = tp->westwood.rtt_win_sx;
+	rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN);
+
+	/*
+	 * A RTT-window has passed. Be careful since if RTT is less than
+	 * 50ms we don't filter but we continue 'building the sample'.
+	 * This minimum limit was choosen since an estimation on small
+	 * time intervals is better to avoid...
+	 * Obvioulsy on a LAN we reasonably will always have
+	 * right_bound = left_bound + WESTWOOD_RTT_MIN
+         */
+
+	if ((left_bound + rtt) < tcp_time_stamp)
+		ret = 1;
+
+	return ret;
+}
+
+/*
+ * @westwood_update_window
+ * It updates RTT evaluation window if it is the right moment to do
+ * it. If so it calls filter for evaluating bandwidth. 
+ */
+
+static void __westwood_update_window(struct sock *sk, __u32 now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 delta = now - tp->westwood.rtt_win_sx;
+
+        if (delta) {
+		if (tp->westwood.rtt)
+			westwood_filter(sk, delta);
+
+		tp->westwood.bk = 0;
+		tp->westwood.rtt_win_sx = tcp_time_stamp;
+	}
+}
+
+
+static void westwood_update_window(struct sock *sk, __u32 now)
+{
+	if (westwood_new_window(sk)) 
+		__westwood_update_window(sk, now);
+}
+
+/*
+ * @__tcp_westwood_fast_bw
+ * It is called when we are in fast path. In particular it is called when
+ * header prediction is successfull. In such case infact update is
+ * straight forward and doesn't need any particular care.
+ */
+
+static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	westwood_update_window(sk, tcp_time_stamp);
+
+	tp->westwood.bk += westwood_acked(sk);
+	tp->westwood.snd_una = tp->snd_una;
+	tp->westwood.rtt_min = westwood_update_rttmin(sk);
+}
+
+static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb)
+{
+        if (tcp_is_westwood(tcp_sk(sk)))
+                __tcp_westwood_fast_bw(sk, skb);
+}
+
+
+/*
+ * @westwood_dupack_update
+ * It updates accounted and cumul_ack when receiving a dupack.
+ */
+
+static void westwood_dupack_update(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->westwood.accounted += tp->mss_cache_std;
+	tp->westwood.cumul_ack = tp->mss_cache_std;
+}
+
+static inline int westwood_may_change_cumul(struct tcp_sock *tp)
+{
+	return (tp->westwood.cumul_ack > tp->mss_cache_std);
+}
+
+static inline void westwood_partial_update(struct tcp_sock *tp)
+{
+	tp->westwood.accounted -= tp->westwood.cumul_ack;
+	tp->westwood.cumul_ack = tp->mss_cache_std;
+}
+
+static inline void westwood_complete_update(struct tcp_sock *tp)
+{
+	tp->westwood.cumul_ack -= tp->westwood.accounted;
+	tp->westwood.accounted = 0;
+}
+
+/*
+ * @westwood_acked_count
+ * This function evaluates cumul_ack for evaluating dk in case of
+ * delayed or partial acks.
+ */
+
+static inline __u32 westwood_acked_count(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->westwood.cumul_ack = westwood_acked(sk);
+
+        /* If cumul_ack is 0 this is a dupack since it's not moving
+         * tp->snd_una.
+         */
+        if (!(tp->westwood.cumul_ack))
+                westwood_dupack_update(sk);
+
+        if (westwood_may_change_cumul(tp)) {
+		/* Partial or delayed ack */
+		if (tp->westwood.accounted >= tp->westwood.cumul_ack)
+			westwood_partial_update(tp);
+		else
+			westwood_complete_update(tp);
+	}
+
+	tp->westwood.snd_una = tp->snd_una;
+
+	return tp->westwood.cumul_ack;
+}
+
+
+/*
+ * @__tcp_westwood_slow_bw
+ * It is called when something is going wrong..even if there could
+ * be no problems! Infact a simple delayed packet may trigger a
+ * dupack. But we need to be careful in such case.
+ */
+
+static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	westwood_update_window(sk, tcp_time_stamp);
+
+	tp->westwood.bk += westwood_acked_count(sk);
+	tp->westwood.rtt_min = westwood_update_rttmin(sk);
+}
+
+static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb)
+{
+        if (tcp_is_westwood(tcp_sk(sk)))
+                __tcp_westwood_slow_bw(sk, skb);
+}
+
+/* This routine deals with incoming acks, but not outgoing ones. */
+static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 prior_snd_una = tp->snd_una;
+	u32 ack_seq = TCP_SKB_CB(skb)->seq;
+	u32 ack = TCP_SKB_CB(skb)->ack_seq;
+	u32 prior_in_flight;
+	s32 seq_rtt;
+	int prior_packets;
+
+	/* If the ack is newer than sent or older than previous acks
+	 * then we can probably ignore it.
+	 */
+	if (after(ack, tp->snd_nxt))
+		goto uninteresting_ack;
+
+	if (before(ack, prior_snd_una))
+		goto old_ack;
+
+	if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
+		/* Window is constant, pure forward advance.
+		 * No more checks are required.
+		 * Note, we use the fact that SND.UNA>=SND.WL2.
+		 */
+		tcp_update_wl(tp, ack, ack_seq);
+		tp->snd_una = ack;
+		tcp_westwood_fast_bw(sk, skb);
+		flag |= FLAG_WIN_UPDATE;
+
+		NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS);
+	} else {
+		if (ack_seq != TCP_SKB_CB(skb)->end_seq)
+			flag |= FLAG_DATA;
+		else
+			NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS);
+
+		flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq);
+
+		if (TCP_SKB_CB(skb)->sacked)
+			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+
+		if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th))
+			flag |= FLAG_ECE;
+
+		tcp_westwood_slow_bw(sk,skb);
+	}
+
+	/* We passed data and got it acked, remove any soft error
+	 * log. Something worked...
+	 */
+	sk->sk_err_soft = 0;
+	tp->rcv_tstamp = tcp_time_stamp;
+	prior_packets = tp->packets_out;
+	if (!prior_packets)
+		goto no_queue;
+
+	prior_in_flight = tcp_packets_in_flight(tp);
+
+	/* See if we can take anything off of the retransmit queue. */
+	flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
+
+	if (tp->frto_counter)
+		tcp_process_frto(sk, prior_snd_una);
+
+	if (tcp_ack_is_dubious(tp, flag)) {
+		/* Advanve CWND, if state allows this. */
+		if ((flag & FLAG_DATA_ACKED) &&
+		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
+		    tcp_may_raise_cwnd(tp, flag))
+			tcp_cong_avoid(tp, ack, seq_rtt);
+		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
+	} else {
+		if ((flag & FLAG_DATA_ACKED) && 
+		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
+			tcp_cong_avoid(tp, ack, seq_rtt);
+	}
+
+	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
+		dst_confirm(sk->sk_dst_cache);
+
+	return 1;
+
+no_queue:
+	tp->probes_out = 0;
+
+	/* If this ack opens up a zero window, clear backoff.  It was
+	 * being used to time the probes, and is probably far higher than
+	 * it needs to be for normal retransmission.
+	 */
+	if (sk->sk_send_head)
+		tcp_ack_probe(sk);
+	return 1;
+
+old_ack:
+	if (TCP_SKB_CB(skb)->sacked)
+		tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+
+uninteresting_ack:
+	SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
+	return 0;
+}
+
+
+/* Look for tcp options. Normally only called on SYN and SYNACK packets.
+ * But, this can also be called on packets in the established flow when
+ * the fast version below fails.
+ */
+void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab)
+{
+	unsigned char *ptr;
+	struct tcphdr *th = skb->h.th;
+	int length=(th->doff*4)-sizeof(struct tcphdr);
+
+	ptr = (unsigned char *)(th + 1);
+	opt_rx->saw_tstamp = 0;
+
+	while(length>0) {
+	  	int opcode=*ptr++;
+		int opsize;
+
+		switch (opcode) {
+			case TCPOPT_EOL:
+				return;
+			case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
+				length--;
+				continue;
+			default:
+				opsize=*ptr++;
+				if (opsize < 2) /* "silly options" */
+					return;
+				if (opsize > length)
+					return;	/* don't parse partial options */
+	  			switch(opcode) {
+				case TCPOPT_MSS:
+					if(opsize==TCPOLEN_MSS && th->syn && !estab) {
+						u16 in_mss = ntohs(get_unaligned((__u16 *)ptr));
+						if (in_mss) {
+							if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
+								in_mss = opt_rx->user_mss;
+							opt_rx->mss_clamp = in_mss;
+						}
+					}
+					break;
+				case TCPOPT_WINDOW:
+					if(opsize==TCPOLEN_WINDOW && th->syn && !estab)
+						if (sysctl_tcp_window_scaling) {
+							__u8 snd_wscale = *(__u8 *) ptr;
+							opt_rx->wscale_ok = 1;
+							if (snd_wscale > 14) {
+								if(net_ratelimit())
+									printk(KERN_INFO "tcp_parse_options: Illegal window "
+									       "scaling value %d >14 received.\n",
+									       snd_wscale);
+								snd_wscale = 14;
+							}
+							opt_rx->snd_wscale = snd_wscale;
+						}
+					break;
+				case TCPOPT_TIMESTAMP:
+					if(opsize==TCPOLEN_TIMESTAMP) {
+						if ((estab && opt_rx->tstamp_ok) ||
+						    (!estab && sysctl_tcp_timestamps)) {
+							opt_rx->saw_tstamp = 1;
+							opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr));
+							opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4)));
+						}
+					}
+					break;
+				case TCPOPT_SACK_PERM:
+					if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) {
+						if (sysctl_tcp_sack) {
+							opt_rx->sack_ok = 1;
+							tcp_sack_reset(opt_rx);
+						}
+					}
+					break;
+
+				case TCPOPT_SACK:
+					if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
+					   !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) &&
+					   opt_rx->sack_ok) {
+						TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
+					}
+	  			};
+	  			ptr+=opsize-2;
+	  			length-=opsize;
+	  	};
+	}
+}
+
+/* Fast parse options. This hopes to only see timestamps.
+ * If it is wrong it falls back on tcp_parse_options().
+ */
+static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
+					 struct tcp_sock *tp)
+{
+	if (th->doff == sizeof(struct tcphdr)>>2) {
+		tp->rx_opt.saw_tstamp = 0;
+		return 0;
+	} else if (tp->rx_opt.tstamp_ok &&
+		   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
+		__u32 *ptr = (__u32 *)(th + 1);
+		if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+				  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
+			tp->rx_opt.saw_tstamp = 1;
+			++ptr;
+			tp->rx_opt.rcv_tsval = ntohl(*ptr);
+			++ptr;
+			tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+			return 1;
+		}
+	}
+	tcp_parse_options(skb, &tp->rx_opt, 1);
+	return 1;
+}
+
+static inline void tcp_store_ts_recent(struct tcp_sock *tp)
+{
+	tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
+	tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
+}
+
+static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
+{
+	if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) {
+		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
+		 * extra check below makes sure this can only happen
+		 * for pure ACK frames.  -DaveM
+		 *
+		 * Not only, also it occurs for expired timestamps.
+		 */
+
+		if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
+		   xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
+			tcp_store_ts_recent(tp);
+	}
+}
+
+/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM
+ *
+ * It is not fatal. If this ACK does _not_ change critical state (seqs, window)
+ * it can pass through stack. So, the following predicate verifies that
+ * this segment is not used for anything but congestion avoidance or
+ * fast retransmit. Moreover, we even are able to eliminate most of such
+ * second order effects, if we apply some small "replay" window (~RTO)
+ * to timestamp space.
+ *
+ * All these measures still do not guarantee that we reject wrapped ACKs
+ * on networks with high bandwidth, when sequence space is recycled fastly,
+ * but it guarantees that such events will be very rare and do not affect
+ * connection seriously. This doesn't look nice, but alas, PAWS is really
+ * buggy extension.
+ *
+ * [ Later note. Even worse! It is buggy for segments _with_ data. RFC
+ * states that events when retransmit arrives after original data are rare.
+ * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is
+ * the biggest problem on large power networks even with minor reordering.
+ * OK, let's give it small replay window. If peer clock is even 1hz, it is safe
+ * up to bandwidth of 18Gigabit/sec. 8) ]
+ */
+
+static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+	u32 seq = TCP_SKB_CB(skb)->seq;
+	u32 ack = TCP_SKB_CB(skb)->ack_seq;
+
+	return (/* 1. Pure ACK with correct sequence number. */
+		(th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+
+		/* 2. ... and duplicate ACK. */
+		ack == tp->snd_una &&
+
+		/* 3. ... and does not update window. */
+		!tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+
+		/* 4. ... and sits in replay window. */
+		(s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ);
+}
+
+static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb)
+{
+	return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
+		xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
+		!tcp_disordered_ack(tp, skb));
+}
+
+/* Check segment sequence number for validity.
+ *
+ * Segment controls are considered valid, if the segment
+ * fits to the window after truncation to the window. Acceptability
+ * of data (and SYN, FIN, of course) is checked separately.
+ * See tcp_data_queue(), for example.
+ *
+ * Also, controls (RST is main one) are accepted using RCV.WUP instead
+ * of RCV.NXT. Peer still did not advance his SND.UNA when we
+ * delayed ACK, so that hisSND.UNA<=ourRCV.WUP.
+ * (borrowed from freebsd)
+ */
+
+static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+	return	!before(end_seq, tp->rcv_wup) &&
+		!after(seq, tp->rcv_nxt + tcp_receive_window(tp));
+}
+
+/* When we get a reset we do this. */
+static void tcp_reset(struct sock *sk)
+{
+	/* We want the right error as BSD sees it (and indeed as we do). */
+	switch (sk->sk_state) {
+		case TCP_SYN_SENT:
+			sk->sk_err = ECONNREFUSED;
+			break;
+		case TCP_CLOSE_WAIT:
+			sk->sk_err = EPIPE;
+			break;
+		case TCP_CLOSE:
+			return;
+		default:
+			sk->sk_err = ECONNRESET;
+	}
+
+	if (!sock_flag(sk, SOCK_DEAD))
+		sk->sk_error_report(sk);
+
+	tcp_done(sk);
+}
+
+/*
+ * 	Process the FIN bit. This now behaves as it is supposed to work
+ *	and the FIN takes effect when it is validly part of sequence
+ *	space. Not before when we get holes.
+ *
+ *	If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT
+ *	(and thence onto LAST-ACK and finally, CLOSE, we never enter
+ *	TIME-WAIT)
+ *
+ *	If we are in FINWAIT-1, a received FIN indicates simultaneous
+ *	close and we go into CLOSING (and later onto TIME-WAIT)
+ *
+ *	If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
+ */
+static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_schedule_ack(tp);
+
+	sk->sk_shutdown |= RCV_SHUTDOWN;
+	sock_set_flag(sk, SOCK_DONE);
+
+	switch (sk->sk_state) {
+		case TCP_SYN_RECV:
+		case TCP_ESTABLISHED:
+			/* Move to CLOSE_WAIT */
+			tcp_set_state(sk, TCP_CLOSE_WAIT);
+			tp->ack.pingpong = 1;
+			break;
+
+		case TCP_CLOSE_WAIT:
+		case TCP_CLOSING:
+			/* Received a retransmission of the FIN, do
+			 * nothing.
+			 */
+			break;
+		case TCP_LAST_ACK:
+			/* RFC793: Remain in the LAST-ACK state. */
+			break;
+
+		case TCP_FIN_WAIT1:
+			/* This case occurs when a simultaneous close
+			 * happens, we must ack the received FIN and
+			 * enter the CLOSING state.
+			 */
+			tcp_send_ack(sk);
+			tcp_set_state(sk, TCP_CLOSING);
+			break;
+		case TCP_FIN_WAIT2:
+			/* Received a FIN -- send ACK and enter TIME_WAIT. */
+			tcp_send_ack(sk);
+			tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+			break;
+		default:
+			/* Only TCP_LISTEN and TCP_CLOSE are left, in these
+			 * cases we should never reach this piece of code.
+			 */
+			printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
+			       __FUNCTION__, sk->sk_state);
+			break;
+	};
+
+	/* It _is_ possible, that we have something out-of-order _after_ FIN.
+	 * Probably, we should reset in this case. For now drop them.
+	 */
+	__skb_queue_purge(&tp->out_of_order_queue);
+	if (tp->rx_opt.sack_ok)
+		tcp_sack_reset(&tp->rx_opt);
+	sk_stream_mem_reclaim(sk);
+
+	if (!sock_flag(sk, SOCK_DEAD)) {
+		sk->sk_state_change(sk);
+
+		/* Do not send POLL_HUP for half duplex close. */
+		if (sk->sk_shutdown == SHUTDOWN_MASK ||
+		    sk->sk_state == TCP_CLOSE)
+			sk_wake_async(sk, 1, POLL_HUP);
+		else
+			sk_wake_async(sk, 1, POLL_IN);
+	}
+}
+
+static __inline__ int
+tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq)
+{
+	if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) {
+		if (before(seq, sp->start_seq))
+			sp->start_seq = seq;
+		if (after(end_seq, sp->end_seq))
+			sp->end_seq = end_seq;
+		return 1;
+	}
+	return 0;
+}
+
+static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+	if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
+		if (before(seq, tp->rcv_nxt))
+			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT);
+		else
+			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT);
+
+		tp->rx_opt.dsack = 1;
+		tp->duplicate_sack[0].start_seq = seq;
+		tp->duplicate_sack[0].end_seq = end_seq;
+		tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok);
+	}
+}
+
+static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq)
+{
+	if (!tp->rx_opt.dsack)
+		tcp_dsack_set(tp, seq, end_seq);
+	else
+		tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
+}
+
+static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+	    before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
+		tcp_enter_quickack_mode(tp);
+
+		if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) {
+			u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+			if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt))
+				end_seq = tp->rcv_nxt;
+			tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq);
+		}
+	}
+
+	tcp_send_ack(sk);
+}
+
+/* These routines update the SACK block as out-of-order packets arrive or
+ * in-order packets close up the sequence space.
+ */
+static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
+{
+	int this_sack;
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	struct tcp_sack_block *swalk = sp+1;
+
+	/* See if the recent change to the first SACK eats into
+	 * or hits the sequence space of other SACK blocks, if so coalesce.
+	 */
+	for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) {
+		if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) {
+			int i;
+
+			/* Zap SWALK, by moving every further SACK up by one slot.
+			 * Decrease num_sacks.
+			 */
+			tp->rx_opt.num_sacks--;
+			tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+			for(i=this_sack; i < tp->rx_opt.num_sacks; i++)
+				sp[i] = sp[i+1];
+			continue;
+		}
+		this_sack++, swalk++;
+	}
+}
+
+static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2)
+{
+	__u32 tmp;
+
+	tmp = sack1->start_seq;
+	sack1->start_seq = sack2->start_seq;
+	sack2->start_seq = tmp;
+
+	tmp = sack1->end_seq;
+	sack1->end_seq = sack2->end_seq;
+	sack2->end_seq = tmp;
+}
+
+static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	int cur_sacks = tp->rx_opt.num_sacks;
+	int this_sack;
+
+	if (!cur_sacks)
+		goto new_sack;
+
+	for (this_sack=0; this_sack<cur_sacks; this_sack++, sp++) {
+		if (tcp_sack_extend(sp, seq, end_seq)) {
+			/* Rotate this_sack to the first one. */
+			for (; this_sack>0; this_sack--, sp--)
+				tcp_sack_swap(sp, sp-1);
+			if (cur_sacks > 1)
+				tcp_sack_maybe_coalesce(tp);
+			return;
+		}
+	}
+
+	/* Could not find an adjacent existing SACK, build a new one,
+	 * put it at the front, and shift everyone else down.  We
+	 * always know there is at least one SACK present already here.
+	 *
+	 * If the sack array is full, forget about the last one.
+	 */
+	if (this_sack >= 4) {
+		this_sack--;
+		tp->rx_opt.num_sacks--;
+		sp--;
+	}
+	for(; this_sack > 0; this_sack--, sp--)
+		*sp = *(sp-1);
+
+new_sack:
+	/* Build the new head SACK, and we're done. */
+	sp->start_seq = seq;
+	sp->end_seq = end_seq;
+	tp->rx_opt.num_sacks++;
+	tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+}
+
+/* RCV.NXT advances, some SACKs should be eaten. */
+
+static void tcp_sack_remove(struct tcp_sock *tp)
+{
+	struct tcp_sack_block *sp = &tp->selective_acks[0];
+	int num_sacks = tp->rx_opt.num_sacks;
+	int this_sack;
+
+	/* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
+	if (skb_queue_len(&tp->out_of_order_queue) == 0) {
+		tp->rx_opt.num_sacks = 0;
+		tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
+		return;
+	}
+
+	for(this_sack = 0; this_sack < num_sacks; ) {
+		/* Check if the start of the sack is covered by RCV.NXT. */
+		if (!before(tp->rcv_nxt, sp->start_seq)) {
+			int i;
+
+			/* RCV.NXT must cover all the block! */
+			BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq));
+
+			/* Zap this SACK, by moving forward any other SACKS. */
+			for (i=this_sack+1; i < num_sacks; i++)
+				tp->selective_acks[i-1] = tp->selective_acks[i];
+			num_sacks--;
+			continue;
+		}
+		this_sack++;
+		sp++;
+	}
+	if (num_sacks != tp->rx_opt.num_sacks) {
+		tp->rx_opt.num_sacks = num_sacks;
+		tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok);
+	}
+}
+
+/* This one checks to see if we can put data from the
+ * out_of_order queue into the receive_queue.
+ */
+static void tcp_ofo_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 dsack_high = tp->rcv_nxt;
+	struct sk_buff *skb;
+
+	while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) {
+		if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+			break;
+
+		if (before(TCP_SKB_CB(skb)->seq, dsack_high)) {
+			__u32 dsack = dsack_high;
+			if (before(TCP_SKB_CB(skb)->end_seq, dsack_high))
+				dsack_high = TCP_SKB_CB(skb)->end_seq;
+			tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack);
+		}
+
+		if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+			SOCK_DEBUG(sk, "ofo packet was already received \n");
+			__skb_unlink(skb, skb->list);
+			__kfree_skb(skb);
+			continue;
+		}
+		SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n",
+			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+			   TCP_SKB_CB(skb)->end_seq);
+
+		__skb_unlink(skb, skb->list);
+		__skb_queue_tail(&sk->sk_receive_queue, skb);
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		if(skb->h.th->fin)
+			tcp_fin(skb, sk, skb->h.th);
+	}
+}
+
+static int tcp_prune_queue(struct sock *sk);
+
+static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+	struct tcp_sock *tp = tcp_sk(sk);
+	int eaten = -1;
+
+	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq)
+		goto drop;
+
+	th = skb->h.th;
+	__skb_pull(skb, th->doff*4);
+
+	TCP_ECN_accept_cwr(tp, skb);
+
+	if (tp->rx_opt.dsack) {
+		tp->rx_opt.dsack = 0;
+		tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks,
+						    4 - tp->rx_opt.tstamp_ok);
+	}
+
+	/*  Queue data for delivery to the user.
+	 *  Packets in sequence go to the receive queue.
+	 *  Out of sequence packets to the out_of_order_queue.
+	 */
+	if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+		if (tcp_receive_window(tp) == 0)
+			goto out_of_window;
+
+		/* Ok. In sequence. In window. */
+		if (tp->ucopy.task == current &&
+		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
+		    sock_owned_by_user(sk) && !tp->urg_data) {
+			int chunk = min_t(unsigned int, skb->len,
+							tp->ucopy.len);
+
+			__set_current_state(TASK_RUNNING);
+
+			local_bh_enable();
+			if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) {
+				tp->ucopy.len -= chunk;
+				tp->copied_seq += chunk;
+				eaten = (chunk == skb->len && !th->fin);
+				tcp_rcv_space_adjust(sk);
+			}
+			local_bh_disable();
+		}
+
+		if (eaten <= 0) {
+queue_and_out:
+			if (eaten < 0 &&
+			    (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+			     !sk_stream_rmem_schedule(sk, skb))) {
+				if (tcp_prune_queue(sk) < 0 ||
+				    !sk_stream_rmem_schedule(sk, skb))
+					goto drop;
+			}
+			sk_stream_set_owner_r(skb, sk);
+			__skb_queue_tail(&sk->sk_receive_queue, skb);
+		}
+		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+		if(skb->len)
+			tcp_event_data_recv(sk, tp, skb);
+		if(th->fin)
+			tcp_fin(skb, sk, th);
+
+		if (skb_queue_len(&tp->out_of_order_queue)) {
+			tcp_ofo_queue(sk);
+
+			/* RFC2581. 4.2. SHOULD send immediate ACK, when
+			 * gap in queue is filled.
+			 */
+			if (!skb_queue_len(&tp->out_of_order_queue))
+				tp->ack.pingpong = 0;
+		}
+
+		if (tp->rx_opt.num_sacks)
+			tcp_sack_remove(tp);
+
+		tcp_fast_path_check(sk, tp);
+
+		if (eaten > 0)
+			__kfree_skb(skb);
+		else if (!sock_flag(sk, SOCK_DEAD))
+			sk->sk_data_ready(sk, 0);
+		return;
+	}
+
+	if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) {
+		/* A retransmit, 2nd most common case.  Force an immediate ack. */
+		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST);
+		tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+out_of_window:
+		tcp_enter_quickack_mode(tp);
+		tcp_schedule_ack(tp);
+drop:
+		__kfree_skb(skb);
+		return;
+	}
+
+	/* Out of window. F.e. zero window probe. */
+	if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
+		goto out_of_window;
+
+	tcp_enter_quickack_mode(tp);
+
+	if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+		/* Partial packet, seq < rcv_next < end_seq */
+		SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
+			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
+			   TCP_SKB_CB(skb)->end_seq);
+
+		tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
+		
+		/* If window is closed, drop tail of packet. But after
+		 * remembering D-SACK for its head made in previous line.
+		 */
+		if (!tcp_receive_window(tp))
+			goto out_of_window;
+		goto queue_and_out;
+	}
+
+	TCP_ECN_check_ce(tp, skb);
+
+	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+	    !sk_stream_rmem_schedule(sk, skb)) {
+		if (tcp_prune_queue(sk) < 0 ||
+		    !sk_stream_rmem_schedule(sk, skb))
+			goto drop;
+	}
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
+	tcp_schedule_ack(tp);
+
+	SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n",
+		   tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
+
+	sk_stream_set_owner_r(skb, sk);
+
+	if (!skb_peek(&tp->out_of_order_queue)) {
+		/* Initial out of order segment, build 1 SACK. */
+		if (tp->rx_opt.sack_ok) {
+			tp->rx_opt.num_sacks = 1;
+			tp->rx_opt.dsack     = 0;
+			tp->rx_opt.eff_sacks = 1;
+			tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
+			tp->selective_acks[0].end_seq =
+						TCP_SKB_CB(skb)->end_seq;
+		}
+		__skb_queue_head(&tp->out_of_order_queue,skb);
+	} else {
+		struct sk_buff *skb1 = tp->out_of_order_queue.prev;
+		u32 seq = TCP_SKB_CB(skb)->seq;
+		u32 end_seq = TCP_SKB_CB(skb)->end_seq;
+
+		if (seq == TCP_SKB_CB(skb1)->end_seq) {
+			__skb_append(skb1, skb);
+
+			if (!tp->rx_opt.num_sacks ||
+			    tp->selective_acks[0].end_seq != seq)
+				goto add_sack;
+
+			/* Common case: data arrive in order after hole. */
+			tp->selective_acks[0].end_seq = end_seq;
+			return;
+		}
+
+		/* Find place to insert this segment. */
+		do {
+			if (!after(TCP_SKB_CB(skb1)->seq, seq))
+				break;
+		} while ((skb1 = skb1->prev) !=
+			 (struct sk_buff*)&tp->out_of_order_queue);
+
+		/* Do skb overlap to previous one? */
+		if (skb1 != (struct sk_buff*)&tp->out_of_order_queue &&
+		    before(seq, TCP_SKB_CB(skb1)->end_seq)) {
+			if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+				/* All the bits are present. Drop. */
+				__kfree_skb(skb);
+				tcp_dsack_set(tp, seq, end_seq);
+				goto add_sack;
+			}
+			if (after(seq, TCP_SKB_CB(skb1)->seq)) {
+				/* Partial overlap. */
+				tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq);
+			} else {
+				skb1 = skb1->prev;
+			}
+		}
+		__skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue);
+		
+		/* And clean segments covered by new one as whole. */
+		while ((skb1 = skb->next) !=
+		       (struct sk_buff*)&tp->out_of_order_queue &&
+		       after(end_seq, TCP_SKB_CB(skb1)->seq)) {
+		       if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) {
+			       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq);
+			       break;
+		       }
+		       __skb_unlink(skb1, skb1->list);
+		       tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq);
+		       __kfree_skb(skb1);
+		}
+
+add_sack:
+		if (tp->rx_opt.sack_ok)
+			tcp_sack_new_ofo_skb(sk, seq, end_seq);
+	}
+}
+
+/* Collapse contiguous sequence of skbs head..tail with
+ * sequence numbers start..end.
+ * Segments with FIN/SYN are not collapsed (only because this
+ * simplifies code)
+ */
+static void
+tcp_collapse(struct sock *sk, struct sk_buff *head,
+	     struct sk_buff *tail, u32 start, u32 end)
+{
+	struct sk_buff *skb;
+
+	/* First, check that queue is collapsable and find
+	 * the point where collapsing can be useful. */
+	for (skb = head; skb != tail; ) {
+		/* No new bits? It is possible on ofo queue. */
+		if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+			struct sk_buff *next = skb->next;
+			__skb_unlink(skb, skb->list);
+			__kfree_skb(skb);
+			NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
+			skb = next;
+			continue;
+		}
+
+		/* The first skb to collapse is:
+		 * - not SYN/FIN and
+		 * - bloated or contains data before "start" or
+		 *   overlaps to the next one.
+		 */
+		if (!skb->h.th->syn && !skb->h.th->fin &&
+		    (tcp_win_from_space(skb->truesize) > skb->len ||
+		     before(TCP_SKB_CB(skb)->seq, start) ||
+		     (skb->next != tail &&
+		      TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq)))
+			break;
+
+		/* Decided to skip this, advance start seq. */
+		start = TCP_SKB_CB(skb)->end_seq;
+		skb = skb->next;
+	}
+	if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+		return;
+
+	while (before(start, end)) {
+		struct sk_buff *nskb;
+		int header = skb_headroom(skb);
+		int copy = SKB_MAX_ORDER(header, 0);
+
+		/* Too big header? This can happen with IPv6. */
+		if (copy < 0)
+			return;
+		if (end-start < copy)
+			copy = end-start;
+		nskb = alloc_skb(copy+header, GFP_ATOMIC);
+		if (!nskb)
+			return;
+		skb_reserve(nskb, header);
+		memcpy(nskb->head, skb->head, header);
+		nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head);
+		nskb->h.raw = nskb->head + (skb->h.raw-skb->head);
+		nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head);
+		memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+		TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
+		__skb_insert(nskb, skb->prev, skb, skb->list);
+		sk_stream_set_owner_r(nskb, sk);
+
+		/* Copy data, releasing collapsed skbs. */
+		while (copy > 0) {
+			int offset = start - TCP_SKB_CB(skb)->seq;
+			int size = TCP_SKB_CB(skb)->end_seq - start;
+
+			if (offset < 0) BUG();
+			if (size > 0) {
+				size = min(copy, size);
+				if (skb_copy_bits(skb, offset, skb_put(nskb, size), size))
+					BUG();
+				TCP_SKB_CB(nskb)->end_seq += size;
+				copy -= size;
+				start += size;
+			}
+			if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
+				struct sk_buff *next = skb->next;
+				__skb_unlink(skb, skb->list);
+				__kfree_skb(skb);
+				NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED);
+				skb = next;
+				if (skb == tail || skb->h.th->syn || skb->h.th->fin)
+					return;
+			}
+		}
+	}
+}
+
+/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs
+ * and tcp_collapse() them until all the queue is collapsed.
+ */
+static void tcp_collapse_ofo_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
+	struct sk_buff *head;
+	u32 start, end;
+
+	if (skb == NULL)
+		return;
+
+	start = TCP_SKB_CB(skb)->seq;
+	end = TCP_SKB_CB(skb)->end_seq;
+	head = skb;
+
+	for (;;) {
+		skb = skb->next;
+
+		/* Segment is terminated when we see gap or when
+		 * we are at the end of all the queue. */
+		if (skb == (struct sk_buff *)&tp->out_of_order_queue ||
+		    after(TCP_SKB_CB(skb)->seq, end) ||
+		    before(TCP_SKB_CB(skb)->end_seq, start)) {
+			tcp_collapse(sk, head, skb, start, end);
+			head = skb;
+			if (skb == (struct sk_buff *)&tp->out_of_order_queue)
+				break;
+			/* Start new segment */
+			start = TCP_SKB_CB(skb)->seq;
+			end = TCP_SKB_CB(skb)->end_seq;
+		} else {
+			if (before(TCP_SKB_CB(skb)->seq, start))
+				start = TCP_SKB_CB(skb)->seq;
+			if (after(TCP_SKB_CB(skb)->end_seq, end))
+				end = TCP_SKB_CB(skb)->end_seq;
+		}
+	}
+}
+
+/* Reduce allocated memory if we can, trying to get
+ * the socket within its memory limits again.
+ *
+ * Return less than zero if we should start dropping frames
+ * until the socket owning process reads some of the data
+ * to stabilize the situation.
+ */
+static int tcp_prune_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk); 
+
+	SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq);
+
+	NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED);
+
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+		tcp_clamp_window(sk, tp);
+	else if (tcp_memory_pressure)
+		tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
+	tcp_collapse_ofo_queue(sk);
+	tcp_collapse(sk, sk->sk_receive_queue.next,
+		     (struct sk_buff*)&sk->sk_receive_queue,
+		     tp->copied_seq, tp->rcv_nxt);
+	sk_stream_mem_reclaim(sk);
+
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		return 0;
+
+	/* Collapsing did not help, destructive actions follow.
+	 * This must not ever occur. */
+
+	/* First, purge the out_of_order queue. */
+	if (skb_queue_len(&tp->out_of_order_queue)) {
+		NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, 
+				 skb_queue_len(&tp->out_of_order_queue));
+		__skb_queue_purge(&tp->out_of_order_queue);
+
+		/* Reset SACK state.  A conforming SACK implementation will
+		 * do the same at a timeout based retransmit.  When a connection
+		 * is in a sad state like this, we care only about integrity
+		 * of the connection not performance.
+		 */
+		if (tp->rx_opt.sack_ok)
+			tcp_sack_reset(&tp->rx_opt);
+		sk_stream_mem_reclaim(sk);
+	}
+
+	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+		return 0;
+
+	/* If we are really being abused, tell the caller to silently
+	 * drop receive data on the floor.  It will get retransmitted
+	 * and hopefully then we'll have sufficient space.
+	 */
+	NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED);
+
+	/* Massive buffer overcommit. */
+	tp->pred_flags = 0;
+	return -1;
+}
+
+
+/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto.
+ * As additional protections, we do not touch cwnd in retransmission phases,
+ * and if application hit its sndbuf limit recently.
+ */
+void tcp_cwnd_application_limited(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->ca_state == TCP_CA_Open &&
+	    sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+		/* Limited by application or receiver window. */
+		u32 win_used = max(tp->snd_cwnd_used, 2U);
+		if (win_used < tp->snd_cwnd) {
+			tp->snd_ssthresh = tcp_current_ssthresh(tp);
+			tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
+		}
+		tp->snd_cwnd_used = 0;
+	}
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+
+/* When incoming ACK allowed to free some skb from write_queue,
+ * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket
+ * on the exit from tcp input handler.
+ *
+ * PROBLEM: sndbuf expansion does not work well with largesend.
+ */
+static void tcp_new_space(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tp->packets_out < tp->snd_cwnd &&
+	    !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) &&
+	    !tcp_memory_pressure &&
+	    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
+ 		int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
+			MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
+		    demanded = max_t(unsigned int, tp->snd_cwnd,
+						   tp->reordering + 1);
+		sndmem *= 2*demanded;
+		if (sndmem > sk->sk_sndbuf)
+			sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
+		tp->snd_cwnd_stamp = tcp_time_stamp;
+	}
+
+	sk->sk_write_space(sk);
+}
+
+static inline void tcp_check_space(struct sock *sk)
+{
+	if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
+		sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
+		if (sk->sk_socket &&
+		    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+			tcp_new_space(sk);
+	}
+}
+
+static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) ||
+	    tcp_packets_in_flight(tp) >= tp->snd_cwnd ||
+	    tcp_write_xmit(sk, tp->nonagle))
+		tcp_check_probe_timer(sk, tp);
+}
+
+static __inline__ void tcp_data_snd_check(struct sock *sk)
+{
+	struct sk_buff *skb = sk->sk_send_head;
+
+	if (skb != NULL)
+		__tcp_data_snd_check(sk, skb);
+	tcp_check_space(sk);
+}
+
+/*
+ * Check if sending an ack is needed.
+ */
+static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	    /* More than one full frame received... */
+	if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss
+	     /* ... and right edge of window advances far enough.
+	      * (tcp_recvmsg() will send ACK otherwise). Or...
+	      */
+	     && __tcp_select_window(sk) >= tp->rcv_wnd) ||
+	    /* We ACK each frame or... */
+	    tcp_in_quickack_mode(tp) ||
+	    /* We have out of order data. */
+	    (ofo_possible &&
+	     skb_peek(&tp->out_of_order_queue))) {
+		/* Then ack it now */
+		tcp_send_ack(sk);
+	} else {
+		/* Else, send delayed ack. */
+		tcp_send_delayed_ack(sk);
+	}
+}
+
+static __inline__ void tcp_ack_snd_check(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	if (!tcp_ack_scheduled(tp)) {
+		/* We sent a data segment already. */
+		return;
+	}
+	__tcp_ack_snd_check(sk, 1);
+}
+
+/*
+ *	This routine is only called when we have urgent data
+ *	signalled. Its the 'slow' part of tcp_urg. It could be
+ *	moved inline now as tcp_urg is only called from one
+ *	place. We handle URGent data wrong. We have to - as
+ *	BSD still doesn't use the correction from RFC961.
+ *	For 1003.1g we should support a new option TCP_STDURG to permit
+ *	either form (or just set the sysctl tcp_stdurg).
+ */
+ 
+static void tcp_check_urg(struct sock * sk, struct tcphdr * th)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 ptr = ntohs(th->urg_ptr);
+
+	if (ptr && !sysctl_tcp_stdurg)
+		ptr--;
+	ptr += ntohl(th->seq);
+
+	/* Ignore urgent data that we've already seen and read. */
+	if (after(tp->copied_seq, ptr))
+		return;
+
+	/* Do not replay urg ptr.
+	 *
+	 * NOTE: interesting situation not covered by specs.
+	 * Misbehaving sender may send urg ptr, pointing to segment,
+	 * which we already have in ofo queue. We are not able to fetch
+	 * such data and will stay in TCP_URG_NOTYET until will be eaten
+	 * by recvmsg(). Seems, we are not obliged to handle such wicked
+	 * situations. But it is worth to think about possibility of some
+	 * DoSes using some hypothetical application level deadlock.
+	 */
+	if (before(ptr, tp->rcv_nxt))
+		return;
+
+	/* Do we already have a newer (or duplicate) urgent pointer? */
+	if (tp->urg_data && !after(ptr, tp->urg_seq))
+		return;
+
+	/* Tell the world about our new urgent pointer. */
+	sk_send_sigurg(sk);
+
+	/* We may be adding urgent data when the last byte read was
+	 * urgent. To do this requires some care. We cannot just ignore
+	 * tp->copied_seq since we would read the last urgent byte again
+	 * as data, nor can we alter copied_seq until this data arrives
+	 * or we break the sematics of SIOCATMARK (and thus sockatmark())
+	 *
+	 * NOTE. Double Dutch. Rendering to plain English: author of comment
+	 * above did something sort of 	send("A", MSG_OOB); send("B", MSG_OOB);
+	 * and expect that both A and B disappear from stream. This is _wrong_.
+	 * Though this happens in BSD with high probability, this is occasional.
+	 * Any application relying on this is buggy. Note also, that fix "works"
+	 * only in this artificial test. Insert some normal data between A and B and we will
+	 * decline of BSD again. Verdict: it is better to remove to trap
+	 * buggy users.
+	 */
+	if (tp->urg_seq == tp->copied_seq && tp->urg_data &&
+	    !sock_flag(sk, SOCK_URGINLINE) &&
+	    tp->copied_seq != tp->rcv_nxt) {
+		struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
+		tp->copied_seq++;
+		if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) {
+			__skb_unlink(skb, skb->list);
+			__kfree_skb(skb);
+		}
+	}
+
+	tp->urg_data   = TCP_URG_NOTYET;
+	tp->urg_seq    = ptr;
+
+	/* Disable header prediction. */
+	tp->pred_flags = 0;
+}
+
+/* This is the 'fast' part of urgent handling. */
+static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Check if we get a new urgent pointer - normally not. */
+	if (th->urg)
+		tcp_check_urg(sk,th);
+
+	/* Do we wait for any urgent data? - normally not... */
+	if (tp->urg_data == TCP_URG_NOTYET) {
+		u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) -
+			  th->syn;
+
+		/* Is the urgent pointer pointing into this packet? */	 
+		if (ptr < skb->len) {
+			u8 tmp;
+			if (skb_copy_bits(skb, ptr, &tmp, 1))
+				BUG();
+			tp->urg_data = TCP_URG_VALID | tmp;
+			if (!sock_flag(sk, SOCK_DEAD))
+				sk->sk_data_ready(sk, 0);
+		}
+	}
+}
+
+static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int chunk = skb->len - hlen;
+	int err;
+
+	local_bh_enable();
+	if (skb->ip_summed==CHECKSUM_UNNECESSARY)
+		err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk);
+	else
+		err = skb_copy_and_csum_datagram_iovec(skb, hlen,
+						       tp->ucopy.iov);
+
+	if (!err) {
+		tp->ucopy.len -= chunk;
+		tp->copied_seq += chunk;
+		tcp_rcv_space_adjust(sk);
+	}
+
+	local_bh_disable();
+	return err;
+}
+
+static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+{
+	int result;
+
+	if (sock_owned_by_user(sk)) {
+		local_bh_enable();
+		result = __tcp_checksum_complete(skb);
+		local_bh_disable();
+	} else {
+		result = __tcp_checksum_complete(skb);
+	}
+	return result;
+}
+
+static __inline__ int
+tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb)
+{
+	return skb->ip_summed != CHECKSUM_UNNECESSARY &&
+		__tcp_checksum_complete_user(sk, skb);
+}
+
+/*
+ *	TCP receive function for the ESTABLISHED state. 
+ *
+ *	It is split into a fast path and a slow path. The fast path is 
+ * 	disabled when:
+ *	- A zero window was announced from us - zero window probing
+ *        is only handled properly in the slow path. 
+ *	- Out of order segments arrived.
+ *	- Urgent data is expected.
+ *	- There is no buffer space left
+ *	- Unexpected TCP flags/window values/header lengths are received
+ *	  (detected by checking the TCP header against pred_flags) 
+ *	- Data is sent in both directions. Fast path only supports pure senders
+ *	  or pure receivers (this means either the sequence number or the ack
+ *	  value must stay constant)
+ *	- Unexpected TCP option.
+ *
+ *	When these conditions are not satisfied it drops into a standard 
+ *	receive procedure patterned after RFC793 to handle all cases.
+ *	The first three cases are guaranteed by proper pred_flags setting,
+ *	the rest is checked inline. Fast processing is turned on in 
+ *	tcp_data_queue when everything is OK.
+ */
+int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
+			struct tcphdr *th, unsigned len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/*
+	 *	Header prediction.
+	 *	The code loosely follows the one in the famous 
+	 *	"30 instruction TCP receive" Van Jacobson mail.
+	 *	
+	 *	Van's trick is to deposit buffers into socket queue 
+	 *	on a device interrupt, to call tcp_recv function
+	 *	on the receive process context and checksum and copy
+	 *	the buffer to user space. smart...
+	 *
+	 *	Our current scheme is not silly either but we take the 
+	 *	extra cost of the net_bh soft interrupt processing...
+	 *	We do checksum and copy also but from device to kernel.
+	 */
+
+	tp->rx_opt.saw_tstamp = 0;
+
+	/*	pred_flags is 0xS?10 << 16 + snd_wnd
+	 *	if header_predition is to be made
+	 *	'S' will always be tp->tcp_header_len >> 2
+	 *	'?' will be 0 for the fast path, otherwise pred_flags is 0 to
+	 *  turn it off	(when there are holes in the receive 
+	 *	 space for instance)
+	 *	PSH flag is ignored.
+	 */
+
+	if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
+		TCP_SKB_CB(skb)->seq == tp->rcv_nxt) {
+		int tcp_header_len = tp->tcp_header_len;
+
+		/* Timestamp header prediction: tcp_header_len
+		 * is automatically equal to th->doff*4 due to pred_flags
+		 * match.
+		 */
+
+		/* Check timestamp */
+		if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
+			__u32 *ptr = (__u32 *)(th + 1);
+
+			/* No? Slow path! */
+			if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+					  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
+				goto slow_path;
+
+			tp->rx_opt.saw_tstamp = 1;
+			++ptr; 
+			tp->rx_opt.rcv_tsval = ntohl(*ptr);
+			++ptr;
+			tp->rx_opt.rcv_tsecr = ntohl(*ptr);
+
+			/* If PAWS failed, check it more carefully in slow path */
+			if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0)
+				goto slow_path;
+
+			/* DO NOT update ts_recent here, if checksum fails
+			 * and timestamp was corrupted part, it will result
+			 * in a hung connection since we will drop all
+			 * future packets due to the PAWS test.
+			 */
+		}
+
+		if (len <= tcp_header_len) {
+			/* Bulk data transfer: sender */
+			if (len == tcp_header_len) {
+				/* Predicted packet is in window by definition.
+				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+				 * Hence, check seq<=rcv_wup reduces to:
+				 */
+				if (tcp_header_len ==
+				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+				    tp->rcv_nxt == tp->rcv_wup)
+					tcp_store_ts_recent(tp);
+
+				tcp_rcv_rtt_measure_ts(tp, skb);
+
+				/* We know that such packets are checksummed
+				 * on entry.
+				 */
+				tcp_ack(sk, skb, 0);
+				__kfree_skb(skb); 
+				tcp_data_snd_check(sk);
+				return 0;
+			} else { /* Header too small */
+				TCP_INC_STATS_BH(TCP_MIB_INERRS);
+				goto discard;
+			}
+		} else {
+			int eaten = 0;
+
+			if (tp->ucopy.task == current &&
+			    tp->copied_seq == tp->rcv_nxt &&
+			    len - tcp_header_len <= tp->ucopy.len &&
+			    sock_owned_by_user(sk)) {
+				__set_current_state(TASK_RUNNING);
+
+				if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+					/* Predicted packet is in window by definition.
+					 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+					 * Hence, check seq<=rcv_wup reduces to:
+					 */
+					if (tcp_header_len ==
+					    (sizeof(struct tcphdr) +
+					     TCPOLEN_TSTAMP_ALIGNED) &&
+					    tp->rcv_nxt == tp->rcv_wup)
+						tcp_store_ts_recent(tp);
+
+					tcp_rcv_rtt_measure_ts(tp, skb);
+
+					__skb_pull(skb, tcp_header_len);
+					tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+					NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
+					eaten = 1;
+				}
+			}
+			if (!eaten) {
+				if (tcp_checksum_complete_user(sk, skb))
+					goto csum_error;
+
+				/* Predicted packet is in window by definition.
+				 * seq == rcv_nxt and rcv_wup <= rcv_nxt.
+				 * Hence, check seq<=rcv_wup reduces to:
+				 */
+				if (tcp_header_len ==
+				    (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) &&
+				    tp->rcv_nxt == tp->rcv_wup)
+					tcp_store_ts_recent(tp);
+
+				tcp_rcv_rtt_measure_ts(tp, skb);
+
+				if ((int)skb->truesize > sk->sk_forward_alloc)
+					goto step5;
+
+				NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS);
+
+				/* Bulk data transfer: receiver */
+				__skb_pull(skb,tcp_header_len);
+				__skb_queue_tail(&sk->sk_receive_queue, skb);
+				sk_stream_set_owner_r(skb, sk);
+				tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+			}
+
+			tcp_event_data_recv(sk, tp, skb);
+
+			if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) {
+				/* Well, only one small jumplet in fast path... */
+				tcp_ack(sk, skb, FLAG_DATA);
+				tcp_data_snd_check(sk);
+				if (!tcp_ack_scheduled(tp))
+					goto no_ack;
+			}
+
+			if (eaten) {
+				if (tcp_in_quickack_mode(tp)) {
+					tcp_send_ack(sk);
+				} else {
+					tcp_send_delayed_ack(sk);
+				}
+			} else {
+				__tcp_ack_snd_check(sk, 0);
+			}
+
+no_ack:
+			if (eaten)
+				__kfree_skb(skb);
+			else
+				sk->sk_data_ready(sk, 0);
+			return 0;
+		}
+	}
+
+slow_path:
+	if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb))
+		goto csum_error;
+
+	/*
+	 * RFC1323: H1. Apply PAWS check first.
+	 */
+	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+	    tcp_paws_discard(tp, skb)) {
+		if (!th->rst) {
+			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+			tcp_send_dupack(sk, skb);
+			goto discard;
+		}
+		/* Resets are accepted even if PAWS failed.
+
+		   ts_recent update must be made after we are sure
+		   that the packet is in window.
+		 */
+	}
+
+	/*
+	 *	Standard slow path.
+	 */
+
+	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+		/* RFC793, page 37: "In all states except SYN-SENT, all reset
+		 * (RST) segments are validated by checking their SEQ-fields."
+		 * And page 69: "If an incoming segment is not acceptable,
+		 * an acknowledgment should be sent in reply (unless the RST bit
+		 * is set, if so drop the segment and return)".
+		 */
+		if (!th->rst)
+			tcp_send_dupack(sk, skb);
+		goto discard;
+	}
+
+	if(th->rst) {
+		tcp_reset(sk);
+		goto discard;
+	}
+
+	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+		TCP_INC_STATS_BH(TCP_MIB_INERRS);
+		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
+		tcp_reset(sk);
+		return 1;
+	}
+
+step5:
+	if(th->ack)
+		tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+	tcp_rcv_rtt_measure_ts(tp, skb);
+
+	/* Process urgent data. */
+	tcp_urg(sk, skb, th);
+
+	/* step 7: process the segment text */
+	tcp_data_queue(sk, skb);
+
+	tcp_data_snd_check(sk);
+	tcp_ack_snd_check(sk);
+	return 0;
+
+csum_error:
+	TCP_INC_STATS_BH(TCP_MIB_INERRS);
+
+discard:
+	__kfree_skb(skb);
+	return 0;
+}
+
+static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
+					 struct tcphdr *th, unsigned len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int saved_clamp = tp->rx_opt.mss_clamp;
+
+	tcp_parse_options(skb, &tp->rx_opt, 0);
+
+	if (th->ack) {
+		/* rfc793:
+		 * "If the state is SYN-SENT then
+		 *    first check the ACK bit
+		 *      If the ACK bit is set
+		 *	  If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send
+		 *        a reset (unless the RST bit is set, if so drop
+		 *        the segment and return)"
+		 *
+		 *  We do not send data with SYN, so that RFC-correct
+		 *  test reduces to:
+		 */
+		if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt)
+			goto reset_and_undo;
+
+		if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+		    !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp,
+			     tcp_time_stamp)) {
+			NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED);
+			goto reset_and_undo;
+		}
+
+		/* Now ACK is acceptable.
+		 *
+		 * "If the RST bit is set
+		 *    If the ACK was acceptable then signal the user "error:
+		 *    connection reset", drop the segment, enter CLOSED state,
+		 *    delete TCB, and return."
+		 */
+
+		if (th->rst) {
+			tcp_reset(sk);
+			goto discard;
+		}
+
+		/* rfc793:
+		 *   "fifth, if neither of the SYN or RST bits is set then
+		 *    drop the segment and return."
+		 *
+		 *    See note below!
+		 *                                        --ANK(990513)
+		 */
+		if (!th->syn)
+			goto discard_and_undo;
+
+		/* rfc793:
+		 *   "If the SYN bit is on ...
+		 *    are acceptable then ...
+		 *    (our SYN has been ACKed), change the connection
+		 *    state to ESTABLISHED..."
+		 */
+
+		TCP_ECN_rcv_synack(tp, th);
+		if (tp->ecn_flags&TCP_ECN_OK)
+			sock_set_flag(sk, SOCK_NO_LARGESEND);
+
+		tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
+		tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+		/* Ok.. it's good. Set up sequence numbers and
+		 * move to established.
+		 */
+		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+		/* RFC1323: The window in SYN & SYN/ACK segments is
+		 * never scaled.
+		 */
+		tp->snd_wnd = ntohs(th->window);
+		tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq);
+
+		if (!tp->rx_opt.wscale_ok) {
+			tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
+			tp->window_clamp = min(tp->window_clamp, 65535U);
+		}
+
+		if (tp->rx_opt.saw_tstamp) {
+			tp->rx_opt.tstamp_ok	   = 1;
+			tp->tcp_header_len =
+				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+			tp->advmss	    -= TCPOLEN_TSTAMP_ALIGNED;
+			tcp_store_ts_recent(tp);
+		} else {
+			tp->tcp_header_len = sizeof(struct tcphdr);
+		}
+
+		if (tp->rx_opt.sack_ok && sysctl_tcp_fack)
+			tp->rx_opt.sack_ok |= 2;
+
+		tcp_sync_mss(sk, tp->pmtu_cookie);
+		tcp_initialize_rcv_mss(sk);
+
+		/* Remember, tcp_poll() does not lock socket!
+		 * Change state from SYN-SENT only after copied_seq
+		 * is initialized. */
+		tp->copied_seq = tp->rcv_nxt;
+		mb();
+		tcp_set_state(sk, TCP_ESTABLISHED);
+
+		/* Make sure socket is routed, for correct metrics.  */
+		tp->af_specific->rebuild_header(sk);
+
+		tcp_init_metrics(sk);
+
+		/* Prevent spurious tcp_cwnd_restart() on first data
+		 * packet.
+		 */
+		tp->lsndtime = tcp_time_stamp;
+
+		tcp_init_buffer_space(sk);
+
+		if (sock_flag(sk, SOCK_KEEPOPEN))
+			tcp_reset_keepalive_timer(sk, keepalive_time_when(tp));
+
+		if (!tp->rx_opt.snd_wscale)
+			__tcp_fast_path_on(tp, tp->snd_wnd);
+		else
+			tp->pred_flags = 0;
+
+		if (!sock_flag(sk, SOCK_DEAD)) {
+			sk->sk_state_change(sk);
+			sk_wake_async(sk, 0, POLL_OUT);
+		}
+
+		if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) {
+			/* Save one ACK. Data will be ready after
+			 * several ticks, if write_pending is set.
+			 *
+			 * It may be deleted, but with this feature tcpdumps
+			 * look so _wonderfully_ clever, that I was not able
+			 * to stand against the temptation 8)     --ANK
+			 */
+			tcp_schedule_ack(tp);
+			tp->ack.lrcvtime = tcp_time_stamp;
+			tp->ack.ato	 = TCP_ATO_MIN;
+			tcp_incr_quickack(tp);
+			tcp_enter_quickack_mode(tp);
+			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+
+discard:
+			__kfree_skb(skb);
+			return 0;
+		} else {
+			tcp_send_ack(sk);
+		}
+		return -1;
+	}
+
+	/* No ACK in the segment */
+
+	if (th->rst) {
+		/* rfc793:
+		 * "If the RST bit is set
+		 *
+		 *      Otherwise (no ACK) drop the segment and return."
+		 */
+
+		goto discard_and_undo;
+	}
+
+	/* PAWS check. */
+	if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0))
+		goto discard_and_undo;
+
+	if (th->syn) {
+		/* We see SYN without ACK. It is attempt of
+		 * simultaneous connect with crossed SYNs.
+		 * Particularly, it can be connect to self.
+		 */
+		tcp_set_state(sk, TCP_SYN_RECV);
+
+		if (tp->rx_opt.saw_tstamp) {
+			tp->rx_opt.tstamp_ok = 1;
+			tcp_store_ts_recent(tp);
+			tp->tcp_header_len =
+				sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+		} else {
+			tp->tcp_header_len = sizeof(struct tcphdr);
+		}
+
+		tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+		tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
+
+		/* RFC1323: The window in SYN & SYN/ACK segments is
+		 * never scaled.
+		 */
+		tp->snd_wnd    = ntohs(th->window);
+		tp->snd_wl1    = TCP_SKB_CB(skb)->seq;
+		tp->max_window = tp->snd_wnd;
+
+		TCP_ECN_rcv_syn(tp, th);
+		if (tp->ecn_flags&TCP_ECN_OK)
+			sock_set_flag(sk, SOCK_NO_LARGESEND);
+
+		tcp_sync_mss(sk, tp->pmtu_cookie);
+		tcp_initialize_rcv_mss(sk);
+
+
+		tcp_send_synack(sk);
+#if 0
+		/* Note, we could accept data and URG from this segment.
+		 * There are no obstacles to make this.
+		 *
+		 * However, if we ignore data in ACKless segments sometimes,
+		 * we have no reasons to accept it sometimes.
+		 * Also, seems the code doing it in step6 of tcp_rcv_state_process
+		 * is not flawless. So, discard packet for sanity.
+		 * Uncomment this return to process the data.
+		 */
+		return -1;
+#else
+		goto discard;
+#endif
+	}
+	/* "fifth, if neither of the SYN or RST bits is set then
+	 * drop the segment and return."
+	 */
+
+discard_and_undo:
+	tcp_clear_options(&tp->rx_opt);
+	tp->rx_opt.mss_clamp = saved_clamp;
+	goto discard;
+
+reset_and_undo:
+	tcp_clear_options(&tp->rx_opt);
+	tp->rx_opt.mss_clamp = saved_clamp;
+	return 1;
+}
+
+
+/*
+ *	This function implements the receiving procedure of RFC 793 for
+ *	all states except ESTABLISHED and TIME_WAIT. 
+ *	It's called from both tcp_v4_rcv and tcp_v6_rcv and should be
+ *	address independent.
+ */
+	
+int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
+			  struct tcphdr *th, unsigned len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int queued = 0;
+
+	tp->rx_opt.saw_tstamp = 0;
+
+	switch (sk->sk_state) {
+	case TCP_CLOSE:
+		goto discard;
+
+	case TCP_LISTEN:
+		if(th->ack)
+			return 1;
+
+		if(th->rst)
+			goto discard;
+
+		if(th->syn) {
+			if(tp->af_specific->conn_request(sk, skb) < 0)
+				return 1;
+
+			init_westwood(sk);
+			init_bictcp(tp);
+
+			/* Now we have several options: In theory there is 
+			 * nothing else in the frame. KA9Q has an option to 
+			 * send data with the syn, BSD accepts data with the
+			 * syn up to the [to be] advertised window and 
+			 * Solaris 2.1 gives you a protocol error. For now 
+			 * we just ignore it, that fits the spec precisely 
+			 * and avoids incompatibilities. It would be nice in
+			 * future to drop through and process the data.
+			 *
+			 * Now that TTCP is starting to be used we ought to 
+			 * queue this data.
+			 * But, this leaves one open to an easy denial of
+		 	 * service attack, and SYN cookies can't defend
+			 * against this problem. So, we drop the data
+			 * in the interest of security over speed.
+			 */
+			goto discard;
+		}
+		goto discard;
+
+	case TCP_SYN_SENT:
+		init_westwood(sk);
+		init_bictcp(tp);
+
+		queued = tcp_rcv_synsent_state_process(sk, skb, th, len);
+		if (queued >= 0)
+			return queued;
+
+		/* Do step6 onward by hand. */
+		tcp_urg(sk, skb, th);
+		__kfree_skb(skb);
+		tcp_data_snd_check(sk);
+		return 0;
+	}
+
+	if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+	    tcp_paws_discard(tp, skb)) {
+		if (!th->rst) {
+			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+			tcp_send_dupack(sk, skb);
+			goto discard;
+		}
+		/* Reset is accepted even if it did not pass PAWS. */
+	}
+
+	/* step 1: check sequence number */
+	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
+		if (!th->rst)
+			tcp_send_dupack(sk, skb);
+		goto discard;
+	}
+
+	/* step 2: check RST bit */
+	if(th->rst) {
+		tcp_reset(sk);
+		goto discard;
+	}
+
+	tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq);
+
+	/* step 3: check security and precedence [ignored] */
+
+	/*	step 4:
+	 *
+	 *	Check for a SYN in window.
+	 */
+	if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
+		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN);
+		tcp_reset(sk);
+		return 1;
+	}
+
+	/* step 5: check the ACK field */
+	if (th->ack) {
+		int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH);
+
+		switch(sk->sk_state) {
+		case TCP_SYN_RECV:
+			if (acceptable) {
+				tp->copied_seq = tp->rcv_nxt;
+				mb();
+				tcp_set_state(sk, TCP_ESTABLISHED);
+				sk->sk_state_change(sk);
+
+				/* Note, that this wakeup is only for marginal
+				 * crossed SYN case. Passively open sockets
+				 * are not waked up, because sk->sk_sleep ==
+				 * NULL and sk->sk_socket == NULL.
+				 */
+				if (sk->sk_socket) {
+					sk_wake_async(sk,0,POLL_OUT);
+				}
+
+				tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
+				tp->snd_wnd = ntohs(th->window) <<
+					      tp->rx_opt.snd_wscale;
+				tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq,
+					    TCP_SKB_CB(skb)->seq);
+
+				/* tcp_ack considers this ACK as duplicate
+				 * and does not calculate rtt.
+				 * Fix it at least with timestamps.
+				 */
+				if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
+				    !tp->srtt)
+					tcp_ack_saw_tstamp(tp, 0);
+
+				if (tp->rx_opt.tstamp_ok)
+					tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
+
+				/* Make sure socket is routed, for
+				 * correct metrics.
+				 */
+				tp->af_specific->rebuild_header(sk);
+
+				tcp_init_metrics(sk);
+
+				/* Prevent spurious tcp_cwnd_restart() on
+				 * first data packet.
+				 */
+				tp->lsndtime = tcp_time_stamp;
+
+				tcp_initialize_rcv_mss(sk);
+				tcp_init_buffer_space(sk);
+				tcp_fast_path_on(tp);
+			} else {
+				return 1;
+			}
+			break;
+
+		case TCP_FIN_WAIT1:
+			if (tp->snd_una == tp->write_seq) {
+				tcp_set_state(sk, TCP_FIN_WAIT2);
+				sk->sk_shutdown |= SEND_SHUTDOWN;
+				dst_confirm(sk->sk_dst_cache);
+
+				if (!sock_flag(sk, SOCK_DEAD))
+					/* Wake up lingering close() */
+					sk->sk_state_change(sk);
+				else {
+					int tmo;
+
+					if (tp->linger2 < 0 ||
+					    (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+					     after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) {
+						tcp_done(sk);
+						NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
+						return 1;
+					}
+
+					tmo = tcp_fin_time(tp);
+					if (tmo > TCP_TIMEWAIT_LEN) {
+						tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN);
+					} else if (th->fin || sock_owned_by_user(sk)) {
+						/* Bad case. We could lose such FIN otherwise.
+						 * It is not a big problem, but it looks confusing
+						 * and not so rare event. We still can lose it now,
+						 * if it spins in bh_lock_sock(), but it is really
+						 * marginal case.
+						 */
+						tcp_reset_keepalive_timer(sk, tmo);
+					} else {
+						tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+						goto discard;
+					}
+				}
+			}
+			break;
+
+		case TCP_CLOSING:
+			if (tp->snd_una == tp->write_seq) {
+				tcp_time_wait(sk, TCP_TIME_WAIT, 0);
+				goto discard;
+			}
+			break;
+
+		case TCP_LAST_ACK:
+			if (tp->snd_una == tp->write_seq) {
+				tcp_update_metrics(sk);
+				tcp_done(sk);
+				goto discard;
+			}
+			break;
+		}
+	} else
+		goto discard;
+
+	/* step 6: check the URG bit */
+	tcp_urg(sk, skb, th);
+
+	/* step 7: process the segment text */
+	switch (sk->sk_state) {
+	case TCP_CLOSE_WAIT:
+	case TCP_CLOSING:
+	case TCP_LAST_ACK:
+		if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt))
+			break;
+	case TCP_FIN_WAIT1:
+	case TCP_FIN_WAIT2:
+		/* RFC 793 says to queue data in these states,
+		 * RFC 1122 says we MUST send a reset. 
+		 * BSD 4.4 also does reset.
+		 */
+		if (sk->sk_shutdown & RCV_SHUTDOWN) {
+			if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
+			    after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) {
+				NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA);
+				tcp_reset(sk);
+				return 1;
+			}
+		}
+		/* Fall through */
+	case TCP_ESTABLISHED: 
+		tcp_data_queue(sk, skb);
+		queued = 1;
+		break;
+	}
+
+	/* tcp_data could move socket to TIME-WAIT */
+	if (sk->sk_state != TCP_CLOSE) {
+		tcp_data_snd_check(sk);
+		tcp_ack_snd_check(sk);
+	}
+
+	if (!queued) { 
+discard:
+		__kfree_skb(skb);
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(sysctl_tcp_ecn);
+EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(tcp_parse_options);
+EXPORT_SYMBOL(tcp_rcv_established);
+EXPORT_SYMBOL(tcp_rcv_state_process);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
new file mode 100644
index 000000000000..3ac6659869c4
--- /dev/null
+++ b/net/ipv4/tcp_ipv4.c
@@ -0,0 +1,2663 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
+ *
+ *		IPv4 specific functions
+ *
+ *
+ *		code split from:
+ *		linux/ipv4/tcp.c
+ *		linux/ipv4/tcp_input.c
+ *		linux/ipv4/tcp_output.c
+ *
+ *		See tcp.c for author information
+ *
+ *	This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Changes:
+ *		David S. Miller	:	New socket lookup architecture.
+ *					This code is dedicated to John Dyson.
+ *		David S. Miller :	Change semantics of established hash,
+ *					half is devoted to TIME_WAIT sockets
+ *					and the rest go in the other half.
+ *		Andi Kleen :		Add support for syncookies and fixed
+ *					some bugs: ip options weren't passed to
+ *					the TCP layer, missed a check for an
+ *					ACK bit.
+ *		Andi Kleen :		Implemented fast path mtu discovery.
+ *	     				Fixed many serious bugs in the
+ *					open_request handling and moved
+ *					most of it into the af independent code.
+ *					Added tail drop and some other bugfixes.
+ *					Added new listen sematics.
+ *		Mike McLagan	:	Routing by source
+ *	Juan Jose Ciarlante:		ip_dynaddr bits
+ *		Andi Kleen:		various fixes.
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year
+ *					coma.
+ *	Andi Kleen		:	Fix new listen.
+ *	Andi Kleen		:	Fix accept error reporting.
+ *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
+ *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
+ *					a single port at the same time.
+ */
+
+#include <linux/config.h>
+
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/cache.h>
+#include <linux/jhash.h>
+#include <linux/init.h>
+#include <linux/times.h>
+
+#include <net/icmp.h>
+#include <net/tcp.h>
+#include <net/ipv6.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+
+#include <linux/inet.h>
+#include <linux/ipv6.h>
+#include <linux/stddef.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+extern int sysctl_ip_dynaddr;
+int sysctl_tcp_tw_reuse;
+int sysctl_tcp_low_latency;
+
+/* Check TCP sequence numbers in ICMP packets. */
+#define ICMP_MIN_LENGTH 8
+
+/* Socket used for sending RSTs */
+static struct socket *tcp_socket;
+
+void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
+		       struct sk_buff *skb);
+
+struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
+	.__tcp_lhash_lock	=	RW_LOCK_UNLOCKED,
+	.__tcp_lhash_users	=	ATOMIC_INIT(0),
+	.__tcp_lhash_wait
+	  = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
+	.__tcp_portalloc_lock	=	SPIN_LOCK_UNLOCKED
+};
+
+/*
+ * This array holds the first and last local port number.
+ * For high-usage systems, use sysctl to change this to
+ * 32768-61000
+ */
+int sysctl_local_port_range[2] = { 1024, 4999 };
+int tcp_port_rover = 1024 - 1;
+
+static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
+				 __u32 faddr, __u16 fport)
+{
+	int h = (laddr ^ lport) ^ (faddr ^ fport);
+	h ^= h >> 16;
+	h ^= h >> 8;
+	return h & (tcp_ehash_size - 1);
+}
+
+static __inline__ int tcp_sk_hashfn(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	__u32 laddr = inet->rcv_saddr;
+	__u16 lport = inet->num;
+	__u32 faddr = inet->daddr;
+	__u16 fport = inet->dport;
+
+	return tcp_hashfn(laddr, lport, faddr, fport);
+}
+
+/* Allocate and initialize a new TCP local port bind bucket.
+ * The bindhash mutex for snum's hash chain must be held here.
+ */
+struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
+					  unsigned short snum)
+{
+	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
+						      SLAB_ATOMIC);
+	if (tb) {
+		tb->port = snum;
+		tb->fastreuse = 0;
+		INIT_HLIST_HEAD(&tb->owners);
+		hlist_add_head(&tb->node, &head->chain);
+	}
+	return tb;
+}
+
+/* Caller must hold hashbucket lock for this tb with local BH disabled */
+void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
+{
+	if (hlist_empty(&tb->owners)) {
+		__hlist_del(&tb->node);
+		kmem_cache_free(tcp_bucket_cachep, tb);
+	}
+}
+
+/* Caller must disable local BH processing. */
+static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
+{
+	struct tcp_bind_hashbucket *head =
+				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
+	struct tcp_bind_bucket *tb;
+
+	spin_lock(&head->lock);
+	tb = tcp_sk(sk)->bind_hash;
+	sk_add_bind_node(child, &tb->owners);
+	tcp_sk(child)->bind_hash = tb;
+	spin_unlock(&head->lock);
+}
+
+inline void tcp_inherit_port(struct sock *sk, struct sock *child)
+{
+	local_bh_disable();
+	__tcp_inherit_port(sk, child);
+	local_bh_enable();
+}
+
+void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
+		   unsigned short snum)
+{
+	inet_sk(sk)->num = snum;
+	sk_add_bind_node(sk, &tb->owners);
+	tcp_sk(sk)->bind_hash = tb;
+}
+
+static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
+{
+	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
+	struct sock *sk2;
+	struct hlist_node *node;
+	int reuse = sk->sk_reuse;
+
+	sk_for_each_bound(sk2, node, &tb->owners) {
+		if (sk != sk2 &&
+		    !tcp_v6_ipv6only(sk2) &&
+		    (!sk->sk_bound_dev_if ||
+		     !sk2->sk_bound_dev_if ||
+		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
+			if (!reuse || !sk2->sk_reuse ||
+			    sk2->sk_state == TCP_LISTEN) {
+				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
+				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
+				    sk2_rcv_saddr == sk_rcv_saddr)
+					break;
+			}
+		}
+	}
+	return node != NULL;
+}
+
+/* Obtain a reference to a local port for the given sock,
+ * if snum is zero it means select any available local port.
+ */
+static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+	struct tcp_bind_hashbucket *head;
+	struct hlist_node *node;
+	struct tcp_bind_bucket *tb;
+	int ret;
+
+	local_bh_disable();
+	if (!snum) {
+		int low = sysctl_local_port_range[0];
+		int high = sysctl_local_port_range[1];
+		int remaining = (high - low) + 1;
+		int rover;
+
+		spin_lock(&tcp_portalloc_lock);
+		rover = tcp_port_rover;
+		do {
+			rover++;
+			if (rover < low || rover > high)
+				rover = low;
+			head = &tcp_bhash[tcp_bhashfn(rover)];
+			spin_lock(&head->lock);
+			tb_for_each(tb, node, &head->chain)
+				if (tb->port == rover)
+					goto next;
+			break;
+		next:
+			spin_unlock(&head->lock);
+		} while (--remaining > 0);
+		tcp_port_rover = rover;
+		spin_unlock(&tcp_portalloc_lock);
+
+		/* Exhausted local port range during search? */
+		ret = 1;
+		if (remaining <= 0)
+			goto fail;
+
+		/* OK, here is the one we will use.  HEAD is
+		 * non-NULL and we hold it's mutex.
+		 */
+		snum = rover;
+	} else {
+		head = &tcp_bhash[tcp_bhashfn(snum)];
+		spin_lock(&head->lock);
+		tb_for_each(tb, node, &head->chain)
+			if (tb->port == snum)
+				goto tb_found;
+	}
+	tb = NULL;
+	goto tb_not_found;
+tb_found:
+	if (!hlist_empty(&tb->owners)) {
+		if (sk->sk_reuse > 1)
+			goto success;
+		if (tb->fastreuse > 0 &&
+		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
+			goto success;
+		} else {
+			ret = 1;
+			if (tcp_bind_conflict(sk, tb))
+				goto fail_unlock;
+		}
+	}
+tb_not_found:
+	ret = 1;
+	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
+		goto fail_unlock;
+	if (hlist_empty(&tb->owners)) {
+		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
+			tb->fastreuse = 1;
+		else
+			tb->fastreuse = 0;
+	} else if (tb->fastreuse &&
+		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
+		tb->fastreuse = 0;
+success:
+	if (!tcp_sk(sk)->bind_hash)
+		tcp_bind_hash(sk, tb, snum);
+	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
+ 	ret = 0;
+
+fail_unlock:
+	spin_unlock(&head->lock);
+fail:
+	local_bh_enable();
+	return ret;
+}
+
+/* Get rid of any references to a local port held by the
+ * given sock.
+ */
+static void __tcp_put_port(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
+	struct tcp_bind_bucket *tb;
+
+	spin_lock(&head->lock);
+	tb = tcp_sk(sk)->bind_hash;
+	__sk_del_bind_node(sk);
+	tcp_sk(sk)->bind_hash = NULL;
+	inet->num = 0;
+	tcp_bucket_destroy(tb);
+	spin_unlock(&head->lock);
+}
+
+void tcp_put_port(struct sock *sk)
+{
+	local_bh_disable();
+	__tcp_put_port(sk);
+	local_bh_enable();
+}
+
+/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
+ * Look, when several writers sleep and reader wakes them up, all but one
+ * immediately hit write lock and grab all the cpus. Exclusive sleep solves
+ * this, _but_ remember, it adds useless work on UP machines (wake up each
+ * exclusive lock release). It should be ifdefed really.
+ */
+
+void tcp_listen_wlock(void)
+{
+	write_lock(&tcp_lhash_lock);
+
+	if (atomic_read(&tcp_lhash_users)) {
+		DEFINE_WAIT(wait);
+
+		for (;;) {
+			prepare_to_wait_exclusive(&tcp_lhash_wait,
+						&wait, TASK_UNINTERRUPTIBLE);
+			if (!atomic_read(&tcp_lhash_users))
+				break;
+			write_unlock_bh(&tcp_lhash_lock);
+			schedule();
+			write_lock_bh(&tcp_lhash_lock);
+		}
+
+		finish_wait(&tcp_lhash_wait, &wait);
+	}
+}
+
+static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
+{
+	struct hlist_head *list;
+	rwlock_t *lock;
+
+	BUG_TRAP(sk_unhashed(sk));
+	if (listen_possible && sk->sk_state == TCP_LISTEN) {
+		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
+		lock = &tcp_lhash_lock;
+		tcp_listen_wlock();
+	} else {
+		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
+		lock = &tcp_ehash[sk->sk_hashent].lock;
+		write_lock(lock);
+	}
+	__sk_add_node(sk, list);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(lock);
+	if (listen_possible && sk->sk_state == TCP_LISTEN)
+		wake_up(&tcp_lhash_wait);
+}
+
+static void tcp_v4_hash(struct sock *sk)
+{
+	if (sk->sk_state != TCP_CLOSE) {
+		local_bh_disable();
+		__tcp_v4_hash(sk, 1);
+		local_bh_enable();
+	}
+}
+
+void tcp_unhash(struct sock *sk)
+{
+	rwlock_t *lock;
+
+	if (sk_unhashed(sk))
+		goto ende;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		local_bh_disable();
+		tcp_listen_wlock();
+		lock = &tcp_lhash_lock;
+	} else {
+		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
+		lock = &head->lock;
+		write_lock_bh(&head->lock);
+	}
+
+	if (__sk_del_node_init(sk))
+		sock_prot_dec_use(sk->sk_prot);
+	write_unlock_bh(lock);
+
+ ende:
+	if (sk->sk_state == TCP_LISTEN)
+		wake_up(&tcp_lhash_wait);
+}
+
+/* Don't inline this cruft.  Here are some nice properties to
+ * exploit here.  The BSD API does not allow a listening TCP
+ * to specify the remote port nor the remote address for the
+ * connection.  So always assume those are both wildcarded
+ * during the search since they can never be otherwise.
+ */
+static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr,
+					     unsigned short hnum, int dif)
+{
+	struct sock *result = NULL, *sk;
+	struct hlist_node *node;
+	int score, hiscore;
+
+	hiscore=-1;
+	sk_for_each(sk, node, head) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		if (inet->num == hnum && !ipv6_only_sock(sk)) {
+			__u32 rcv_saddr = inet->rcv_saddr;
+
+			score = (sk->sk_family == PF_INET ? 1 : 0);
+			if (rcv_saddr) {
+				if (rcv_saddr != daddr)
+					continue;
+				score+=2;
+			}
+			if (sk->sk_bound_dev_if) {
+				if (sk->sk_bound_dev_if != dif)
+					continue;
+				score+=2;
+			}
+			if (score == 5)
+				return sk;
+			if (score > hiscore) {
+				hiscore = score;
+				result = sk;
+			}
+		}
+	}
+	return result;
+}
+
+/* Optimize the common listener case. */
+static inline struct sock *tcp_v4_lookup_listener(u32 daddr,
+		unsigned short hnum, int dif)
+{
+	struct sock *sk = NULL;
+	struct hlist_head *head;
+
+	read_lock(&tcp_lhash_lock);
+	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
+	if (!hlist_empty(head)) {
+		struct inet_sock *inet = inet_sk((sk = __sk_head(head)));
+
+		if (inet->num == hnum && !sk->sk_node.next &&
+		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
+		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
+		    !sk->sk_bound_dev_if)
+			goto sherry_cache;
+		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
+	}
+	if (sk) {
+sherry_cache:
+		sock_hold(sk);
+	}
+	read_unlock(&tcp_lhash_lock);
+	return sk;
+}
+
+/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
+ * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
+ *
+ * Local BH must be disabled here.
+ */
+
+static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
+						       u32 daddr, u16 hnum,
+						       int dif)
+{
+	struct tcp_ehash_bucket *head;
+	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
+	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
+	struct sock *sk;
+	struct hlist_node *node;
+	/* Optimize here for direct hit, only listening connections can
+	 * have wildcards anyways.
+	 */
+	int hash = tcp_hashfn(daddr, hnum, saddr, sport);
+	head = &tcp_ehash[hash];
+	read_lock(&head->lock);
+	sk_for_each(sk, node, &head->chain) {
+		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
+			goto hit; /* You sunk my battleship! */
+	}
+
+	/* Must check for a TIME_WAIT'er before going to listener hash. */
+	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
+		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
+			goto hit;
+	}
+	sk = NULL;
+out:
+	read_unlock(&head->lock);
+	return sk;
+hit:
+	sock_hold(sk);
+	goto out;
+}
+
+static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
+					   u32 daddr, u16 hnum, int dif)
+{
+	struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
+						      daddr, hnum, dif);
+
+	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
+}
+
+inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
+				  u16 dport, int dif)
+{
+	struct sock *sk;
+
+	local_bh_disable();
+	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
+	local_bh_enable();
+
+	return sk;
+}
+
+EXPORT_SYMBOL_GPL(tcp_v4_lookup);
+
+static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
+{
+	return secure_tcp_sequence_number(skb->nh.iph->daddr,
+					  skb->nh.iph->saddr,
+					  skb->h.th->dest,
+					  skb->h.th->source);
+}
+
+/* called with local bh disabled */
+static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
+				      struct tcp_tw_bucket **twp)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	u32 daddr = inet->rcv_saddr;
+	u32 saddr = inet->daddr;
+	int dif = sk->sk_bound_dev_if;
+	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
+	__u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
+	int hash = tcp_hashfn(daddr, lport, saddr, inet->dport);
+	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
+	struct sock *sk2;
+	struct hlist_node *node;
+	struct tcp_tw_bucket *tw;
+
+	write_lock(&head->lock);
+
+	/* Check TIME-WAIT sockets first. */
+	sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) {
+		tw = (struct tcp_tw_bucket *)sk2;
+
+		if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+			struct tcp_sock *tp = tcp_sk(sk);
+
+			/* With PAWS, it is safe from the viewpoint
+			   of data integrity. Even without PAWS it
+			   is safe provided sequence spaces do not
+			   overlap i.e. at data rates <= 80Mbit/sec.
+
+			   Actually, the idea is close to VJ's one,
+			   only timestamp cache is held not per host,
+			   but per port pair and TW bucket is used
+			   as state holder.
+
+			   If TW bucket has been already destroyed we
+			   fall back to VJ's scheme and use initial
+			   timestamp retrieved from peer table.
+			 */
+			if (tw->tw_ts_recent_stamp &&
+			    (!twp || (sysctl_tcp_tw_reuse &&
+				      xtime.tv_sec -
+				      tw->tw_ts_recent_stamp > 1))) {
+				if ((tp->write_seq =
+						tw->tw_snd_nxt + 65535 + 2) == 0)
+					tp->write_seq = 1;
+				tp->rx_opt.ts_recent	   = tw->tw_ts_recent;
+				tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+				sock_hold(sk2);
+				goto unique;
+			} else
+				goto not_unique;
+		}
+	}
+	tw = NULL;
+
+	/* And established part... */
+	sk_for_each(sk2, node, &head->chain) {
+		if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+			goto not_unique;
+	}
+
+unique:
+	/* Must record num and sport now. Otherwise we will see
+	 * in hash table socket with a funny identity. */
+	inet->num = lport;
+	inet->sport = htons(lport);
+	sk->sk_hashent = hash;
+	BUG_TRAP(sk_unhashed(sk));
+	__sk_add_node(sk, &head->chain);
+	sock_prot_inc_use(sk->sk_prot);
+	write_unlock(&head->lock);
+
+	if (twp) {
+		*twp = tw;
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+	} else if (tw) {
+		/* Silly. Should hash-dance instead... */
+		tcp_tw_deschedule(tw);
+		NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
+
+		tcp_tw_put(tw);
+	}
+
+	return 0;
+
+not_unique:
+	write_unlock(&head->lock);
+	return -EADDRNOTAVAIL;
+}
+
+static inline u32 connect_port_offset(const struct sock *sk)
+{
+	const struct inet_sock *inet = inet_sk(sk);
+
+	return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, 
+					 inet->dport);
+}
+
+/*
+ * Bind a port for a connect operation and hash it.
+ */
+static inline int tcp_v4_hash_connect(struct sock *sk)
+{
+	unsigned short snum = inet_sk(sk)->num;
+ 	struct tcp_bind_hashbucket *head;
+ 	struct tcp_bind_bucket *tb;
+	int ret;
+
+ 	if (!snum) {
+ 		int low = sysctl_local_port_range[0];
+ 		int high = sysctl_local_port_range[1];
+		int range = high - low;
+ 		int i;
+		int port;
+		static u32 hint;
+		u32 offset = hint + connect_port_offset(sk);
+		struct hlist_node *node;
+ 		struct tcp_tw_bucket *tw = NULL;
+
+ 		local_bh_disable();
+		for (i = 1; i <= range; i++) {
+			port = low + (i + offset) % range;
+ 			head = &tcp_bhash[tcp_bhashfn(port)];
+ 			spin_lock(&head->lock);
+
+ 			/* Does not bother with rcv_saddr checks,
+ 			 * because the established check is already
+ 			 * unique enough.
+ 			 */
+			tb_for_each(tb, node, &head->chain) {
+ 				if (tb->port == port) {
+ 					BUG_TRAP(!hlist_empty(&tb->owners));
+ 					if (tb->fastreuse >= 0)
+ 						goto next_port;
+ 					if (!__tcp_v4_check_established(sk,
+									port,
+									&tw))
+ 						goto ok;
+ 					goto next_port;
+ 				}
+ 			}
+
+ 			tb = tcp_bucket_create(head, port);
+ 			if (!tb) {
+ 				spin_unlock(&head->lock);
+ 				break;
+ 			}
+ 			tb->fastreuse = -1;
+ 			goto ok;
+
+ 		next_port:
+ 			spin_unlock(&head->lock);
+ 		}
+ 		local_bh_enable();
+
+ 		return -EADDRNOTAVAIL;
+
+ok:
+		hint += i;
+
+ 		/* Head lock still held and bh's disabled */
+ 		tcp_bind_hash(sk, tb, port);
+		if (sk_unhashed(sk)) {
+ 			inet_sk(sk)->sport = htons(port);
+ 			__tcp_v4_hash(sk, 0);
+ 		}
+ 		spin_unlock(&head->lock);
+
+ 		if (tw) {
+ 			tcp_tw_deschedule(tw);
+ 			tcp_tw_put(tw);
+ 		}
+
+		ret = 0;
+		goto out;
+ 	}
+
+ 	head  = &tcp_bhash[tcp_bhashfn(snum)];
+ 	tb  = tcp_sk(sk)->bind_hash;
+	spin_lock_bh(&head->lock);
+	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+		__tcp_v4_hash(sk, 0);
+		spin_unlock_bh(&head->lock);
+		return 0;
+	} else {
+		spin_unlock(&head->lock);
+		/* No definite answer... Walk to established hash table */
+		ret = __tcp_v4_check_established(sk, snum, NULL);
+out:
+		local_bh_enable();
+		return ret;
+	}
+}
+
+/* This will initiate an outgoing connection. */
+int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
+	struct rtable *rt;
+	u32 daddr, nexthop;
+	int tmp;
+	int err;
+
+	if (addr_len < sizeof(struct sockaddr_in))
+		return -EINVAL;
+
+	if (usin->sin_family != AF_INET)
+		return -EAFNOSUPPORT;
+
+	nexthop = daddr = usin->sin_addr.s_addr;
+	if (inet->opt && inet->opt->srr) {
+		if (!daddr)
+			return -EINVAL;
+		nexthop = inet->opt->faddr;
+	}
+
+	tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+			       RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
+			       IPPROTO_TCP,
+			       inet->sport, usin->sin_port, sk);
+	if (tmp < 0)
+		return tmp;
+
+	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
+		ip_rt_put(rt);
+		return -ENETUNREACH;
+	}
+
+	if (!inet->opt || !inet->opt->srr)
+		daddr = rt->rt_dst;
+
+	if (!inet->saddr)
+		inet->saddr = rt->rt_src;
+	inet->rcv_saddr = inet->saddr;
+
+	if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
+		/* Reset inherited state */
+		tp->rx_opt.ts_recent	   = 0;
+		tp->rx_opt.ts_recent_stamp = 0;
+		tp->write_seq		   = 0;
+	}
+
+	if (sysctl_tcp_tw_recycle &&
+	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
+		struct inet_peer *peer = rt_get_peer(rt);
+
+		/* VJ's idea. We save last timestamp seen from
+		 * the destination in peer table, when entering state TIME-WAIT
+		 * and initialize rx_opt.ts_recent from it, when trying new connection.
+		 */
+
+		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
+			tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
+			tp->rx_opt.ts_recent = peer->tcp_ts;
+		}
+	}
+
+	inet->dport = usin->sin_port;
+	inet->daddr = daddr;
+
+	tp->ext_header_len = 0;
+	if (inet->opt)
+		tp->ext_header_len = inet->opt->optlen;
+
+	tp->rx_opt.mss_clamp = 536;
+
+	/* Socket identity is still unknown (sport may be zero).
+	 * However we set state to SYN-SENT and not releasing socket
+	 * lock select source port, enter ourselves into the hash tables and
+	 * complete initialization after this.
+	 */
+	tcp_set_state(sk, TCP_SYN_SENT);
+	err = tcp_v4_hash_connect(sk);
+	if (err)
+		goto failure;
+
+	err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
+	if (err)
+		goto failure;
+
+	/* OK, now commit destination to socket.  */
+	__sk_dst_set(sk, &rt->u.dst);
+	tcp_v4_setup_caps(sk, &rt->u.dst);
+
+	if (!tp->write_seq)
+		tp->write_seq = secure_tcp_sequence_number(inet->saddr,
+							   inet->daddr,
+							   inet->sport,
+							   usin->sin_port);
+
+	inet->id = tp->write_seq ^ jiffies;
+
+	err = tcp_connect(sk);
+	rt = NULL;
+	if (err)
+		goto failure;
+
+	return 0;
+
+failure:
+	/* This unhashes the socket and releases the local port, if necessary. */
+	tcp_set_state(sk, TCP_CLOSE);
+	ip_rt_put(rt);
+	sk->sk_route_caps = 0;
+	inet->dport = 0;
+	return err;
+}
+
+static __inline__ int tcp_v4_iif(struct sk_buff *skb)
+{
+	return ((struct rtable *)skb->dst)->rt_iif;
+}
+
+static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
+{
+	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
+}
+
+static struct open_request *tcp_v4_search_req(struct tcp_sock *tp,
+					      struct open_request ***prevp,
+					      __u16 rport,
+					      __u32 raddr, __u32 laddr)
+{
+	struct tcp_listen_opt *lopt = tp->listen_opt;
+	struct open_request *req, **prev;
+
+	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
+	     (req = *prev) != NULL;
+	     prev = &req->dl_next) {
+		if (req->rmt_port == rport &&
+		    req->af.v4_req.rmt_addr == raddr &&
+		    req->af.v4_req.loc_addr == laddr &&
+		    TCP_INET_FAMILY(req->class->family)) {
+			BUG_TRAP(!req->sk);
+			*prevp = prev;
+			break;
+		}
+	}
+
+	return req;
+}
+
+static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_listen_opt *lopt = tp->listen_opt;
+	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
+
+	req->expires = jiffies + TCP_TIMEOUT_INIT;
+	req->retrans = 0;
+	req->sk = NULL;
+	req->dl_next = lopt->syn_table[h];
+
+	write_lock(&tp->syn_wait_lock);
+	lopt->syn_table[h] = req;
+	write_unlock(&tp->syn_wait_lock);
+
+	tcp_synq_added(sk);
+}
+
+
+/*
+ * This routine does path mtu discovery as defined in RFC1191.
+ */
+static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
+				     u32 mtu)
+{
+	struct dst_entry *dst;
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
+	 * send out by Linux are always <576bytes so they should go through
+	 * unfragmented).
+	 */
+	if (sk->sk_state == TCP_LISTEN)
+		return;
+
+	/* We don't check in the destentry if pmtu discovery is forbidden
+	 * on this route. We just assume that no packet_to_big packets
+	 * are send back when pmtu discovery is not active.
+     	 * There is a small race when the user changes this flag in the
+	 * route, but I think that's acceptable.
+	 */
+	if ((dst = __sk_dst_check(sk, 0)) == NULL)
+		return;
+
+	dst->ops->update_pmtu(dst, mtu);
+
+	/* Something is about to be wrong... Remember soft error
+	 * for the case, if this connection will not able to recover.
+	 */
+	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+		sk->sk_err_soft = EMSGSIZE;
+
+	mtu = dst_mtu(dst);
+
+	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
+	    tp->pmtu_cookie > mtu) {
+		tcp_sync_mss(sk, mtu);
+
+		/* Resend the TCP packet because it's
+		 * clear that the old packet has been
+		 * dropped. This is the new "fast" path mtu
+		 * discovery.
+		 */
+		tcp_simple_retransmit(sk);
+	} /* else let the usual retransmit timer handle it */
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.  If err < 0 then the socket should
+ * be closed and the error returned to the user.  If err > 0
+ * it's just the icmp type << 8 | icmp code.  After adjustment
+ * header points to the first 8 bytes of the tcp header.  We need
+ * to find the appropriate port.
+ *
+ * The locking strategy used here is very "optimistic". When
+ * someone else accesses the socket the ICMP is just dropped
+ * and for some paths there is no check at all.
+ * A more general error queue to queue errors for later handling
+ * is probably better.
+ *
+ */
+
+void tcp_v4_err(struct sk_buff *skb, u32 info)
+{
+	struct iphdr *iph = (struct iphdr *)skb->data;
+	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
+	struct tcp_sock *tp;
+	struct inet_sock *inet;
+	int type = skb->h.icmph->type;
+	int code = skb->h.icmph->code;
+	struct sock *sk;
+	__u32 seq;
+	int err;
+
+	if (skb->len < (iph->ihl << 2) + 8) {
+		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+		return;
+	}
+
+	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
+			   th->source, tcp_v4_iif(skb));
+	if (!sk) {
+		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+		return;
+	}
+	if (sk->sk_state == TCP_TIME_WAIT) {
+		tcp_tw_put((struct tcp_tw_bucket *)sk);
+		return;
+	}
+
+	bh_lock_sock(sk);
+	/* If too many ICMPs get dropped on busy
+	 * servers this needs to be solved differently.
+	 */
+	if (sock_owned_by_user(sk))
+		NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
+
+	if (sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	tp = tcp_sk(sk);
+	seq = ntohl(th->seq);
+	if (sk->sk_state != TCP_LISTEN &&
+	    !between(seq, tp->snd_una, tp->snd_nxt)) {
+		NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
+		goto out;
+	}
+
+	switch (type) {
+	case ICMP_SOURCE_QUENCH:
+		/* Just silently ignore these. */
+		goto out;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code > NR_ICMP_UNREACH)
+			goto out;
+
+		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
+			if (!sock_owned_by_user(sk))
+				do_pmtu_discovery(sk, iph, info);
+			goto out;
+		}
+
+		err = icmp_err_convert[code].errno;
+		break;
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	default:
+		goto out;
+	}
+
+	switch (sk->sk_state) {
+		struct open_request *req, **prev;
+	case TCP_LISTEN:
+		if (sock_owned_by_user(sk))
+			goto out;
+
+		req = tcp_v4_search_req(tp, &prev, th->dest,
+					iph->daddr, iph->saddr);
+		if (!req)
+			goto out;
+
+		/* ICMPs are not backlogged, hence we cannot get
+		   an established socket here.
+		 */
+		BUG_TRAP(!req->sk);
+
+		if (seq != req->snt_isn) {
+			NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
+			goto out;
+		}
+
+		/*
+		 * Still in SYN_RECV, just remove it silently.
+		 * There is no good way to pass the error to the newly
+		 * created socket, and POSIX does not want network
+		 * errors returned from accept().
+		 */
+		tcp_synq_drop(sk, req, prev);
+		goto out;
+
+	case TCP_SYN_SENT:
+	case TCP_SYN_RECV:  /* Cannot happen.
+			       It can f.e. if SYNs crossed.
+			     */
+		if (!sock_owned_by_user(sk)) {
+			TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+			sk->sk_err = err;
+
+			sk->sk_error_report(sk);
+
+			tcp_done(sk);
+		} else {
+			sk->sk_err_soft = err;
+		}
+		goto out;
+	}
+
+	/* If we've already connected we will keep trying
+	 * until we time out, or the user gives up.
+	 *
+	 * rfc1122 4.2.3.9 allows to consider as hard errors
+	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
+	 * but it is obsoleted by pmtu discovery).
+	 *
+	 * Note, that in modern internet, where routing is unreliable
+	 * and in each dark corner broken firewalls sit, sending random
+	 * errors ordered by their masters even this two messages finally lose
+	 * their original sense (even Linux sends invalid PORT_UNREACHs)
+	 *
+	 * Now we are in compliance with RFCs.
+	 *							--ANK (980905)
+	 */
+
+	inet = inet_sk(sk);
+	if (!sock_owned_by_user(sk) && inet->recverr) {
+		sk->sk_err = err;
+		sk->sk_error_report(sk);
+	} else	{ /* Only an error on timeout */
+		sk->sk_err_soft = err;
+	}
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/* This routine computes an IPv4 TCP checksum. */
+void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
+		       struct sk_buff *skb)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (skb->ip_summed == CHECKSUM_HW) {
+		th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
+		skb->csum = offsetof(struct tcphdr, check);
+	} else {
+		th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
+					 csum_partial((char *)th,
+						      th->doff << 2,
+						      skb->csum));
+	}
+}
+
+/*
+ *	This routine will send an RST to the other tcp.
+ *
+ *	Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
+ *		      for reset.
+ *	Answer: if a packet caused RST, it is not for a socket
+ *		existing in our system, if it is matched to a socket,
+ *		it is just duplicate segment or bug in other side's TCP.
+ *		So that we build reply only basing on parameters
+ *		arrived with segment.
+ *	Exception: precedence violation. We do not implement it in any case.
+ */
+
+static void tcp_v4_send_reset(struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+	struct tcphdr rth;
+	struct ip_reply_arg arg;
+
+	/* Never send a reset in response to a reset. */
+	if (th->rst)
+		return;
+
+	if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
+		return;
+
+	/* Swap the send and the receive. */
+	memset(&rth, 0, sizeof(struct tcphdr));
+	rth.dest   = th->source;
+	rth.source = th->dest;
+	rth.doff   = sizeof(struct tcphdr) / 4;
+	rth.rst    = 1;
+
+	if (th->ack) {
+		rth.seq = th->ack_seq;
+	} else {
+		rth.ack = 1;
+		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
+				    skb->len - (th->doff << 2));
+	}
+
+	memset(&arg, 0, sizeof arg);
+	arg.iov[0].iov_base = (unsigned char *)&rth;
+	arg.iov[0].iov_len  = sizeof rth;
+	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
+				      skb->nh.iph->saddr, /*XXX*/
+				      sizeof(struct tcphdr), IPPROTO_TCP, 0);
+	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+
+	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
+
+	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
+	TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
+}
+
+/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
+   outside socket context is ugly, certainly. What can I do?
+ */
+
+static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+			    u32 win, u32 ts)
+{
+	struct tcphdr *th = skb->h.th;
+	struct {
+		struct tcphdr th;
+		u32 tsopt[3];
+	} rep;
+	struct ip_reply_arg arg;
+
+	memset(&rep.th, 0, sizeof(struct tcphdr));
+	memset(&arg, 0, sizeof arg);
+
+	arg.iov[0].iov_base = (unsigned char *)&rep;
+	arg.iov[0].iov_len  = sizeof(rep.th);
+	if (ts) {
+		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
+				     (TCPOPT_TIMESTAMP << 8) |
+				     TCPOLEN_TIMESTAMP);
+		rep.tsopt[1] = htonl(tcp_time_stamp);
+		rep.tsopt[2] = htonl(ts);
+		arg.iov[0].iov_len = sizeof(rep);
+	}
+
+	/* Swap the send and the receive. */
+	rep.th.dest    = th->source;
+	rep.th.source  = th->dest;
+	rep.th.doff    = arg.iov[0].iov_len / 4;
+	rep.th.seq     = htonl(seq);
+	rep.th.ack_seq = htonl(ack);
+	rep.th.ack     = 1;
+	rep.th.window  = htons(win);
+
+	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
+				      skb->nh.iph->saddr, /*XXX*/
+				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
+	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
+
+	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
+
+	TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
+}
+
+static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
+
+	tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
+			tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
+
+	tcp_tw_put(tw);
+}
+
+static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
+{
+	tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd,
+			req->ts_recent);
+}
+
+static struct dst_entry* tcp_v4_route_req(struct sock *sk,
+					  struct open_request *req)
+{
+	struct rtable *rt;
+	struct ip_options *opt = req->af.v4_req.opt;
+	struct flowi fl = { .oif = sk->sk_bound_dev_if,
+			    .nl_u = { .ip4_u =
+				      { .daddr = ((opt && opt->srr) ?
+						  opt->faddr :
+						  req->af.v4_req.rmt_addr),
+					.saddr = req->af.v4_req.loc_addr,
+					.tos = RT_CONN_FLAGS(sk) } },
+			    .proto = IPPROTO_TCP,
+			    .uli_u = { .ports =
+				       { .sport = inet_sk(sk)->sport,
+					 .dport = req->rmt_port } } };
+
+	if (ip_route_output_flow(&rt, &fl, sk, 0)) {
+		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+		return NULL;
+	}
+	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
+		ip_rt_put(rt);
+		IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
+		return NULL;
+	}
+	return &rt->u.dst;
+}
+
+/*
+ *	Send a SYN-ACK after having received an ACK.
+ *	This still operates on a open_request only, not on a big
+ *	socket.
+ */
+static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
+			      struct dst_entry *dst)
+{
+	int err = -1;
+	struct sk_buff * skb;
+
+	/* First, grab a route. */
+	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+		goto out;
+
+	skb = tcp_make_synack(sk, dst, req);
+
+	if (skb) {
+		struct tcphdr *th = skb->h.th;
+
+		th->check = tcp_v4_check(th, skb->len,
+					 req->af.v4_req.loc_addr,
+					 req->af.v4_req.rmt_addr,
+					 csum_partial((char *)th, skb->len,
+						      skb->csum));
+
+		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
+					    req->af.v4_req.rmt_addr,
+					    req->af.v4_req.opt);
+		if (err == NET_XMIT_CN)
+			err = 0;
+	}
+
+out:
+	dst_release(dst);
+	return err;
+}
+
+/*
+ *	IPv4 open_request destructor.
+ */
+static void tcp_v4_or_free(struct open_request *req)
+{
+	if (req->af.v4_req.opt)
+		kfree(req->af.v4_req.opt);
+}
+
+static inline void syn_flood_warning(struct sk_buff *skb)
+{
+	static unsigned long warntime;
+
+	if (time_after(jiffies, (warntime + HZ * 60))) {
+		warntime = jiffies;
+		printk(KERN_INFO
+		       "possible SYN flooding on port %d. Sending cookies.\n",
+		       ntohs(skb->h.th->dest));
+	}
+}
+
+/*
+ * Save and compile IPv4 options into the open_request if needed.
+ */
+static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
+						     struct sk_buff *skb)
+{
+	struct ip_options *opt = &(IPCB(skb)->opt);
+	struct ip_options *dopt = NULL;
+
+	if (opt && opt->optlen) {
+		int opt_size = optlength(opt);
+		dopt = kmalloc(opt_size, GFP_ATOMIC);
+		if (dopt) {
+			if (ip_options_echo(dopt, skb)) {
+				kfree(dopt);
+				dopt = NULL;
+			}
+		}
+	}
+	return dopt;
+}
+
+/*
+ * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
+ * One SYN_RECV socket costs about 80bytes on a 32bit machine.
+ * It would be better to replace it with a global counter for all sockets
+ * but then some measure against one socket starving all other sockets
+ * would be needed.
+ *
+ * It was 128 by default. Experiments with real servers show, that
+ * it is absolutely not enough even at 100conn/sec. 256 cures most
+ * of problems. This value is adjusted to 128 for very small machines
+ * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * Further increasing requires to change hash table size.
+ */
+int sysctl_max_syn_backlog = 256;
+
+struct or_calltable or_ipv4 = {
+	.family		=	PF_INET,
+	.rtx_syn_ack	=	tcp_v4_send_synack,
+	.send_ack	=	tcp_v4_or_send_ack,
+	.destructor	=	tcp_v4_or_free,
+	.send_reset	=	tcp_v4_send_reset,
+};
+
+int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_options_received tmp_opt;
+	struct open_request *req;
+	__u32 saddr = skb->nh.iph->saddr;
+	__u32 daddr = skb->nh.iph->daddr;
+	__u32 isn = TCP_SKB_CB(skb)->when;
+	struct dst_entry *dst = NULL;
+#ifdef CONFIG_SYN_COOKIES
+	int want_cookie = 0;
+#else
+#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
+#endif
+
+	/* Never answer to SYNs send to broadcast or multicast */
+	if (((struct rtable *)skb->dst)->rt_flags &
+	    (RTCF_BROADCAST | RTCF_MULTICAST))
+		goto drop;
+
+	/* TW buckets are converted to open requests without
+	 * limitations, they conserve resources and peer is
+	 * evidently real one.
+	 */
+	if (tcp_synq_is_full(sk) && !isn) {
+#ifdef CONFIG_SYN_COOKIES
+		if (sysctl_tcp_syncookies) {
+			want_cookie = 1;
+		} else
+#endif
+		goto drop;
+	}
+
+	/* Accept backlog is full. If we have already queued enough
+	 * of warm entries in syn queue, drop request. It is better than
+	 * clogging syn queue with openreqs with exponentially increasing
+	 * timeout.
+	 */
+	if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
+		goto drop;
+
+	req = tcp_openreq_alloc();
+	if (!req)
+		goto drop;
+
+	tcp_clear_options(&tmp_opt);
+	tmp_opt.mss_clamp = 536;
+	tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
+
+	tcp_parse_options(skb, &tmp_opt, 0);
+
+	if (want_cookie) {
+		tcp_clear_options(&tmp_opt);
+		tmp_opt.saw_tstamp = 0;
+	}
+
+	if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
+		/* Some OSes (unknown ones, but I see them on web server, which
+		 * contains information interesting only for windows'
+		 * users) do not send their stamp in SYN. It is easy case.
+		 * We simply do not advertise TS support.
+		 */
+		tmp_opt.saw_tstamp = 0;
+		tmp_opt.tstamp_ok  = 0;
+	}
+	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
+
+	tcp_openreq_init(req, &tmp_opt, skb);
+
+	req->af.v4_req.loc_addr = daddr;
+	req->af.v4_req.rmt_addr = saddr;
+	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
+	req->class = &or_ipv4;
+	if (!want_cookie)
+		TCP_ECN_create_request(req, skb->h.th);
+
+	if (want_cookie) {
+#ifdef CONFIG_SYN_COOKIES
+		syn_flood_warning(skb);
+#endif
+		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
+	} else if (!isn) {
+		struct inet_peer *peer = NULL;
+
+		/* VJ's idea. We save last timestamp seen
+		 * from the destination in peer table, when entering
+		 * state TIME-WAIT, and check against it before
+		 * accepting new connection request.
+		 *
+		 * If "isn" is not zero, this request hit alive
+		 * timewait bucket, so that all the necessary checks
+		 * are made in the function processing timewait state.
+		 */
+		if (tmp_opt.saw_tstamp &&
+		    sysctl_tcp_tw_recycle &&
+		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
+		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
+		    peer->v4daddr == saddr) {
+			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
+			    (s32)(peer->tcp_ts - req->ts_recent) >
+							TCP_PAWS_WINDOW) {
+				NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
+				dst_release(dst);
+				goto drop_and_free;
+			}
+		}
+		/* Kill the following clause, if you dislike this way. */
+		else if (!sysctl_tcp_syncookies &&
+			 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
+			  (sysctl_max_syn_backlog >> 2)) &&
+			 (!peer || !peer->tcp_ts_stamp) &&
+			 (!dst || !dst_metric(dst, RTAX_RTT))) {
+			/* Without syncookies last quarter of
+			 * backlog is filled with destinations,
+			 * proven to be alive.
+			 * It means that we continue to communicate
+			 * to destinations, already remembered
+			 * to the moment of synflood.
+			 */
+			NETDEBUG(if (net_ratelimit()) \
+					printk(KERN_DEBUG "TCP: drop open "
+							  "request from %u.%u."
+							  "%u.%u/%u\n", \
+					       NIPQUAD(saddr),
+					       ntohs(skb->h.th->source)));
+			dst_release(dst);
+			goto drop_and_free;
+		}
+
+		isn = tcp_v4_init_sequence(sk, skb);
+	}
+	req->snt_isn = isn;
+
+	if (tcp_v4_send_synack(sk, req, dst))
+		goto drop_and_free;
+
+	if (want_cookie) {
+	   	tcp_openreq_free(req);
+	} else {
+		tcp_v4_synq_add(sk, req);
+	}
+	return 0;
+
+drop_and_free:
+	tcp_openreq_free(req);
+drop:
+	TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
+	return 0;
+}
+
+
+/*
+ * The three way handshake has completed - we got a valid synack -
+ * now create the new socket.
+ */
+struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
+				  struct open_request *req,
+				  struct dst_entry *dst)
+{
+	struct inet_sock *newinet;
+	struct tcp_sock *newtp;
+	struct sock *newsk;
+
+	if (sk_acceptq_is_full(sk))
+		goto exit_overflow;
+
+	if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
+		goto exit;
+
+	newsk = tcp_create_openreq_child(sk, req, skb);
+	if (!newsk)
+		goto exit;
+
+	newsk->sk_dst_cache = dst;
+	tcp_v4_setup_caps(newsk, dst);
+
+	newtp		      = tcp_sk(newsk);
+	newinet		      = inet_sk(newsk);
+	newinet->daddr	      = req->af.v4_req.rmt_addr;
+	newinet->rcv_saddr    = req->af.v4_req.loc_addr;
+	newinet->saddr	      = req->af.v4_req.loc_addr;
+	newinet->opt	      = req->af.v4_req.opt;
+	req->af.v4_req.opt    = NULL;
+	newinet->mc_index     = tcp_v4_iif(skb);
+	newinet->mc_ttl	      = skb->nh.iph->ttl;
+	newtp->ext_header_len = 0;
+	if (newinet->opt)
+		newtp->ext_header_len = newinet->opt->optlen;
+	newinet->id = newtp->write_seq ^ jiffies;
+
+	tcp_sync_mss(newsk, dst_mtu(dst));
+	newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
+	tcp_initialize_rcv_mss(newsk);
+
+	__tcp_v4_hash(newsk, 0);
+	__tcp_inherit_port(sk, newsk);
+
+	return newsk;
+
+exit_overflow:
+	NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
+exit:
+	NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
+	dst_release(dst);
+	return NULL;
+}
+
+static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = skb->h.th;
+	struct iphdr *iph = skb->nh.iph;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sock *nsk;
+	struct open_request **prev;
+	/* Find possible connection requests. */
+	struct open_request *req = tcp_v4_search_req(tp, &prev, th->source,
+						     iph->saddr, iph->daddr);
+	if (req)
+		return tcp_check_req(sk, skb, req, prev);
+
+	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
+					  th->source,
+					  skb->nh.iph->daddr,
+					  ntohs(th->dest),
+					  tcp_v4_iif(skb));
+
+	if (nsk) {
+		if (nsk->sk_state != TCP_TIME_WAIT) {
+			bh_lock_sock(nsk);
+			return nsk;
+		}
+		tcp_tw_put((struct tcp_tw_bucket *)nsk);
+		return NULL;
+	}
+
+#ifdef CONFIG_SYN_COOKIES
+	if (!th->rst && !th->syn && th->ack)
+		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
+#endif
+	return sk;
+}
+
+static int tcp_v4_checksum_init(struct sk_buff *skb)
+{
+	if (skb->ip_summed == CHECKSUM_HW) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
+				  skb->nh.iph->daddr, skb->csum))
+			return 0;
+
+		NETDEBUG(if (net_ratelimit())
+				printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
+		skb->ip_summed = CHECKSUM_NONE;
+	}
+	if (skb->len <= 76) {
+		if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
+				 skb->nh.iph->daddr,
+				 skb_checksum(skb, 0, skb->len, 0)))
+			return -1;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	} else {
+		skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
+					  skb->nh.iph->saddr,
+					  skb->nh.iph->daddr, 0);
+	}
+	return 0;
+}
+
+
+/* The socket must have it's spinlock held when we get
+ * here.
+ *
+ * We have a potential double-lock case here, so even when
+ * doing backlog processing we use the BH locking scheme.
+ * This is because we cannot sleep with the original spinlock
+ * held.
+ */
+int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
+		TCP_CHECK_TIMER(sk);
+		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
+			goto reset;
+		TCP_CHECK_TIMER(sk);
+		return 0;
+	}
+
+	if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
+		goto csum_err;
+
+	if (sk->sk_state == TCP_LISTEN) {
+		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
+		if (!nsk)
+			goto discard;
+
+		if (nsk != sk) {
+			if (tcp_child_process(sk, nsk, skb))
+				goto reset;
+			return 0;
+		}
+	}
+
+	TCP_CHECK_TIMER(sk);
+	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
+		goto reset;
+	TCP_CHECK_TIMER(sk);
+	return 0;
+
+reset:
+	tcp_v4_send_reset(skb);
+discard:
+	kfree_skb(skb);
+	/* Be careful here. If this function gets more complicated and
+	 * gcc suffers from register pressure on the x86, sk (in %ebx)
+	 * might be destroyed here. This current version compiles correctly,
+	 * but you have been warned.
+	 */
+	return 0;
+
+csum_err:
+	TCP_INC_STATS_BH(TCP_MIB_INERRS);
+	goto discard;
+}
+
+/*
+ *	From tcp_input.c
+ */
+
+int tcp_v4_rcv(struct sk_buff *skb)
+{
+	struct tcphdr *th;
+	struct sock *sk;
+	int ret;
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto discard_it;
+
+	/* Count it even if it's bad */
+	TCP_INC_STATS_BH(TCP_MIB_INSEGS);
+
+	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
+		goto discard_it;
+
+	th = skb->h.th;
+
+	if (th->doff < sizeof(struct tcphdr) / 4)
+		goto bad_packet;
+	if (!pskb_may_pull(skb, th->doff * 4))
+		goto discard_it;
+
+	/* An explanation is required here, I think.
+	 * Packet length and doff are validated by header prediction,
+	 * provided case of th->doff==0 is elimineted.
+	 * So, we defer the checks. */
+	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
+	     tcp_v4_checksum_init(skb) < 0))
+		goto bad_packet;
+
+	th = skb->h.th;
+	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
+	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
+				    skb->len - th->doff * 4);
+	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
+	TCP_SKB_CB(skb)->when	 = 0;
+	TCP_SKB_CB(skb)->flags	 = skb->nh.iph->tos;
+	TCP_SKB_CB(skb)->sacked	 = 0;
+
+	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
+			     skb->nh.iph->daddr, ntohs(th->dest),
+			     tcp_v4_iif(skb));
+
+	if (!sk)
+		goto no_tcp_socket;
+
+process:
+	if (sk->sk_state == TCP_TIME_WAIT)
+		goto do_time_wait;
+
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
+		goto discard_and_relse;
+
+	if (sk_filter(sk, skb, 0))
+		goto discard_and_relse;
+
+	skb->dev = NULL;
+
+	bh_lock_sock(sk);
+	ret = 0;
+	if (!sock_owned_by_user(sk)) {
+		if (!tcp_prequeue(sk, skb))
+			ret = tcp_v4_do_rcv(sk, skb);
+	} else
+		sk_add_backlog(sk, skb);
+	bh_unlock_sock(sk);
+
+	sock_put(sk);
+
+	return ret;
+
+no_tcp_socket:
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto discard_it;
+
+	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+bad_packet:
+		TCP_INC_STATS_BH(TCP_MIB_INERRS);
+	} else {
+		tcp_v4_send_reset(skb);
+	}
+
+discard_it:
+	/* Discard frame. */
+	kfree_skb(skb);
+  	return 0;
+
+discard_and_relse:
+	sock_put(sk);
+	goto discard_it;
+
+do_time_wait:
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		goto discard_it;
+	}
+
+	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
+		TCP_INC_STATS_BH(TCP_MIB_INERRS);
+		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		goto discard_it;
+	}
+	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
+					   skb, th, skb->len)) {
+	case TCP_TW_SYN: {
+		struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr,
+							  ntohs(th->dest),
+							  tcp_v4_iif(skb));
+		if (sk2) {
+			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
+			tcp_tw_put((struct tcp_tw_bucket *)sk);
+			sk = sk2;
+			goto process;
+		}
+		/* Fall through to ACK */
+	}
+	case TCP_TW_ACK:
+		tcp_v4_timewait_ack(sk, skb);
+		break;
+	case TCP_TW_RST:
+		goto no_tcp_socket;
+	case TCP_TW_SUCCESS:;
+	}
+	goto discard_it;
+}
+
+/* With per-bucket locks this operation is not-atomic, so that
+ * this version is not worse.
+ */
+static void __tcp_v4_rehash(struct sock *sk)
+{
+	sk->sk_prot->unhash(sk);
+	sk->sk_prot->hash(sk);
+}
+
+static int tcp_v4_reselect_saddr(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	int err;
+	struct rtable *rt;
+	__u32 old_saddr = inet->saddr;
+	__u32 new_saddr;
+	__u32 daddr = inet->daddr;
+
+	if (inet->opt && inet->opt->srr)
+		daddr = inet->opt->faddr;
+
+	/* Query new route. */
+	err = ip_route_connect(&rt, daddr, 0,
+			       RT_CONN_FLAGS(sk),
+			       sk->sk_bound_dev_if,
+			       IPPROTO_TCP,
+			       inet->sport, inet->dport, sk);
+	if (err)
+		return err;
+
+	__sk_dst_set(sk, &rt->u.dst);
+	tcp_v4_setup_caps(sk, &rt->u.dst);
+
+	new_saddr = rt->rt_src;
+
+	if (new_saddr == old_saddr)
+		return 0;
+
+	if (sysctl_ip_dynaddr > 1) {
+		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->"
+				 "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
+		       NIPQUAD(old_saddr),
+		       NIPQUAD(new_saddr));
+	}
+
+	inet->saddr = new_saddr;
+	inet->rcv_saddr = new_saddr;
+
+	/* XXX The only one ugly spot where we need to
+	 * XXX really change the sockets identity after
+	 * XXX it has entered the hashes. -DaveM
+	 *
+	 * Besides that, it does not check for connection
+	 * uniqueness. Wait for troubles.
+	 */
+	__tcp_v4_rehash(sk);
+	return 0;
+}
+
+int tcp_v4_rebuild_header(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
+	u32 daddr;
+	int err;
+
+	/* Route is OK, nothing to do. */
+	if (rt)
+		return 0;
+
+	/* Reroute. */
+	daddr = inet->daddr;
+	if (inet->opt && inet->opt->srr)
+		daddr = inet->opt->faddr;
+
+	{
+		struct flowi fl = { .oif = sk->sk_bound_dev_if,
+				    .nl_u = { .ip4_u =
+					      { .daddr = daddr,
+						.saddr = inet->saddr,
+						.tos = RT_CONN_FLAGS(sk) } },
+				    .proto = IPPROTO_TCP,
+				    .uli_u = { .ports =
+					       { .sport = inet->sport,
+						 .dport = inet->dport } } };
+						
+		err = ip_route_output_flow(&rt, &fl, sk, 0);
+	}
+	if (!err) {
+		__sk_dst_set(sk, &rt->u.dst);
+		tcp_v4_setup_caps(sk, &rt->u.dst);
+		return 0;
+	}
+
+	/* Routing failed... */
+	sk->sk_route_caps = 0;
+
+	if (!sysctl_ip_dynaddr ||
+	    sk->sk_state != TCP_SYN_SENT ||
+	    (sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
+	    (err = tcp_v4_reselect_saddr(sk)) != 0)
+		sk->sk_err_soft = -err;
+
+	return err;
+}
+
+static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
+{
+	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
+	struct inet_sock *inet = inet_sk(sk);
+
+	sin->sin_family		= AF_INET;
+	sin->sin_addr.s_addr	= inet->daddr;
+	sin->sin_port		= inet->dport;
+}
+
+/* VJ's idea. Save last timestamp seen from this destination
+ * and hold it at least for normal timewait interval to use for duplicate
+ * segment detection in subsequent connections, before they enter synchronized
+ * state.
+ */
+
+int tcp_v4_remember_stamp(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
+	struct inet_peer *peer = NULL;
+	int release_it = 0;
+
+	if (!rt || rt->rt_dst != inet->daddr) {
+		peer = inet_getpeer(inet->daddr, 1);
+		release_it = 1;
+	} else {
+		if (!rt->peer)
+			rt_bind_peer(rt, 1);
+		peer = rt->peer;
+	}
+
+	if (peer) {
+		if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
+		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
+		     peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
+			peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
+			peer->tcp_ts = tp->rx_opt.ts_recent;
+		}
+		if (release_it)
+			inet_putpeer(peer);
+		return 1;
+	}
+
+	return 0;
+}
+
+int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
+{
+	struct inet_peer *peer = NULL;
+
+	peer = inet_getpeer(tw->tw_daddr, 1);
+
+	if (peer) {
+		if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
+		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
+		     peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
+			peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
+			peer->tcp_ts = tw->tw_ts_recent;
+		}
+		inet_putpeer(peer);
+		return 1;
+	}
+
+	return 0;
+}
+
+struct tcp_func ipv4_specific = {
+	.queue_xmit	=	ip_queue_xmit,
+	.send_check	=	tcp_v4_send_check,
+	.rebuild_header	=	tcp_v4_rebuild_header,
+	.conn_request	=	tcp_v4_conn_request,
+	.syn_recv_sock	=	tcp_v4_syn_recv_sock,
+	.remember_stamp	=	tcp_v4_remember_stamp,
+	.net_header_len	=	sizeof(struct iphdr),
+	.setsockopt	=	ip_setsockopt,
+	.getsockopt	=	ip_getsockopt,
+	.addr2sockaddr	=	v4_addr2sockaddr,
+	.sockaddr_len	=	sizeof(struct sockaddr_in),
+};
+
+/* NOTE: A lot of things set to zero explicitly by call to
+ *       sk_alloc() so need not be done here.
+ */
+static int tcp_v4_init_sock(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	skb_queue_head_init(&tp->out_of_order_queue);
+	tcp_init_xmit_timers(sk);
+	tcp_prequeue_init(tp);
+
+	tp->rto  = TCP_TIMEOUT_INIT;
+	tp->mdev = TCP_TIMEOUT_INIT;
+
+	/* So many TCP implementations out there (incorrectly) count the
+	 * initial SYN frame in their delayed-ACK and congestion control
+	 * algorithms that we must have the following bandaid to talk
+	 * efficiently to them.  -DaveM
+	 */
+	tp->snd_cwnd = 2;
+
+	/* See draft-stevens-tcpca-spec-01 for discussion of the
+	 * initialization of these values.
+	 */
+	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
+	tp->snd_cwnd_clamp = ~0;
+	tp->mss_cache_std = tp->mss_cache = 536;
+
+	tp->reordering = sysctl_tcp_reordering;
+
+	sk->sk_state = TCP_CLOSE;
+
+	sk->sk_write_space = sk_stream_write_space;
+	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
+
+	tp->af_specific = &ipv4_specific;
+
+	sk->sk_sndbuf = sysctl_tcp_wmem[1];
+	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
+
+	atomic_inc(&tcp_sockets_allocated);
+
+	return 0;
+}
+
+int tcp_v4_destroy_sock(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_clear_xmit_timers(sk);
+
+	/* Cleanup up the write buffer. */
+  	sk_stream_writequeue_purge(sk);
+
+	/* Cleans up our, hopefully empty, out_of_order_queue. */
+  	__skb_queue_purge(&tp->out_of_order_queue);
+
+	/* Clean prequeue, it must be empty really */
+	__skb_queue_purge(&tp->ucopy.prequeue);
+
+	/* Clean up a referenced TCP bind bucket. */
+	if (tp->bind_hash)
+		tcp_put_port(sk);
+
+	/*
+	 * If sendmsg cached page exists, toss it.
+	 */
+	if (sk->sk_sndmsg_page) {
+		__free_page(sk->sk_sndmsg_page);
+		sk->sk_sndmsg_page = NULL;
+	}
+
+	atomic_dec(&tcp_sockets_allocated);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(tcp_v4_destroy_sock);
+
+#ifdef CONFIG_PROC_FS
+/* Proc filesystem TCP sock list dumping. */
+
+static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
+{
+	return hlist_empty(head) ? NULL :
+		list_entry(head->first, struct tcp_tw_bucket, tw_node);
+}
+
+static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
+{
+	return tw->tw_node.next ?
+		hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
+}
+
+static void *listening_get_next(struct seq_file *seq, void *cur)
+{
+	struct tcp_sock *tp;
+	struct hlist_node *node;
+	struct sock *sk = cur;
+	struct tcp_iter_state* st = seq->private;
+
+	if (!sk) {
+		st->bucket = 0;
+		sk = sk_head(&tcp_listening_hash[0]);
+		goto get_sk;
+	}
+
+	++st->num;
+
+	if (st->state == TCP_SEQ_STATE_OPENREQ) {
+		struct open_request *req = cur;
+
+	       	tp = tcp_sk(st->syn_wait_sk);
+		req = req->dl_next;
+		while (1) {
+			while (req) {
+				if (req->class->family == st->family) {
+					cur = req;
+					goto out;
+				}
+				req = req->dl_next;
+			}
+			if (++st->sbucket >= TCP_SYNQ_HSIZE)
+				break;
+get_req:
+			req = tp->listen_opt->syn_table[st->sbucket];
+		}
+		sk	  = sk_next(st->syn_wait_sk);
+		st->state = TCP_SEQ_STATE_LISTENING;
+		read_unlock_bh(&tp->syn_wait_lock);
+	} else {
+	       	tp = tcp_sk(sk);
+		read_lock_bh(&tp->syn_wait_lock);
+		if (tp->listen_opt && tp->listen_opt->qlen)
+			goto start_req;
+		read_unlock_bh(&tp->syn_wait_lock);
+		sk = sk_next(sk);
+	}
+get_sk:
+	sk_for_each_from(sk, node) {
+		if (sk->sk_family == st->family) {
+			cur = sk;
+			goto out;
+		}
+	       	tp = tcp_sk(sk);
+		read_lock_bh(&tp->syn_wait_lock);
+		if (tp->listen_opt && tp->listen_opt->qlen) {
+start_req:
+			st->uid		= sock_i_uid(sk);
+			st->syn_wait_sk = sk;
+			st->state	= TCP_SEQ_STATE_OPENREQ;
+			st->sbucket	= 0;
+			goto get_req;
+		}
+		read_unlock_bh(&tp->syn_wait_lock);
+	}
+	if (++st->bucket < TCP_LHTABLE_SIZE) {
+		sk = sk_head(&tcp_listening_hash[st->bucket]);
+		goto get_sk;
+	}
+	cur = NULL;
+out:
+	return cur;
+}
+
+static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
+{
+	void *rc = listening_get_next(seq, NULL);
+
+	while (rc && *pos) {
+		rc = listening_get_next(seq, rc);
+		--*pos;
+	}
+	return rc;
+}
+
+static void *established_get_first(struct seq_file *seq)
+{
+	struct tcp_iter_state* st = seq->private;
+	void *rc = NULL;
+
+	for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) {
+		struct sock *sk;
+		struct hlist_node *node;
+		struct tcp_tw_bucket *tw;
+
+		/* We can reschedule _before_ having picked the target: */
+		cond_resched_softirq();
+
+		read_lock(&tcp_ehash[st->bucket].lock);
+		sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) {
+			if (sk->sk_family != st->family) {
+				continue;
+			}
+			rc = sk;
+			goto out;
+		}
+		st->state = TCP_SEQ_STATE_TIME_WAIT;
+		tw_for_each(tw, node,
+			    &tcp_ehash[st->bucket + tcp_ehash_size].chain) {
+			if (tw->tw_family != st->family) {
+				continue;
+			}
+			rc = tw;
+			goto out;
+		}
+		read_unlock(&tcp_ehash[st->bucket].lock);
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+	}
+out:
+	return rc;
+}
+
+static void *established_get_next(struct seq_file *seq, void *cur)
+{
+	struct sock *sk = cur;
+	struct tcp_tw_bucket *tw;
+	struct hlist_node *node;
+	struct tcp_iter_state* st = seq->private;
+
+	++st->num;
+
+	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
+		tw = cur;
+		tw = tw_next(tw);
+get_tw:
+		while (tw && tw->tw_family != st->family) {
+			tw = tw_next(tw);
+		}
+		if (tw) {
+			cur = tw;
+			goto out;
+		}
+		read_unlock(&tcp_ehash[st->bucket].lock);
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+
+		/* We can reschedule between buckets: */
+		cond_resched_softirq();
+
+		if (++st->bucket < tcp_ehash_size) {
+			read_lock(&tcp_ehash[st->bucket].lock);
+			sk = sk_head(&tcp_ehash[st->bucket].chain);
+		} else {
+			cur = NULL;
+			goto out;
+		}
+	} else
+		sk = sk_next(sk);
+
+	sk_for_each_from(sk, node) {
+		if (sk->sk_family == st->family)
+			goto found;
+	}
+
+	st->state = TCP_SEQ_STATE_TIME_WAIT;
+	tw = tw_head(&tcp_ehash[st->bucket + tcp_ehash_size].chain);
+	goto get_tw;
+found:
+	cur = sk;
+out:
+	return cur;
+}
+
+static void *established_get_idx(struct seq_file *seq, loff_t pos)
+{
+	void *rc = established_get_first(seq);
+
+	while (rc && pos) {
+		rc = established_get_next(seq, rc);
+		--pos;
+	}		
+	return rc;
+}
+
+static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
+{
+	void *rc;
+	struct tcp_iter_state* st = seq->private;
+
+	tcp_listen_lock();
+	st->state = TCP_SEQ_STATE_LISTENING;
+	rc	  = listening_get_idx(seq, &pos);
+
+	if (!rc) {
+		tcp_listen_unlock();
+		local_bh_disable();
+		st->state = TCP_SEQ_STATE_ESTABLISHED;
+		rc	  = established_get_idx(seq, pos);
+	}
+
+	return rc;
+}
+
+static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct tcp_iter_state* st = seq->private;
+	st->state = TCP_SEQ_STATE_LISTENING;
+	st->num = 0;
+	return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+}
+
+static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	void *rc = NULL;
+	struct tcp_iter_state* st;
+
+	if (v == SEQ_START_TOKEN) {
+		rc = tcp_get_idx(seq, 0);
+		goto out;
+	}
+	st = seq->private;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_OPENREQ:
+	case TCP_SEQ_STATE_LISTENING:
+		rc = listening_get_next(seq, v);
+		if (!rc) {
+			tcp_listen_unlock();
+			local_bh_disable();
+			st->state = TCP_SEQ_STATE_ESTABLISHED;
+			rc	  = established_get_first(seq);
+		}
+		break;
+	case TCP_SEQ_STATE_ESTABLISHED:
+	case TCP_SEQ_STATE_TIME_WAIT:
+		rc = established_get_next(seq, v);
+		break;
+	}
+out:
+	++*pos;
+	return rc;
+}
+
+static void tcp_seq_stop(struct seq_file *seq, void *v)
+{
+	struct tcp_iter_state* st = seq->private;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_OPENREQ:
+		if (v) {
+			struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
+			read_unlock_bh(&tp->syn_wait_lock);
+		}
+	case TCP_SEQ_STATE_LISTENING:
+		if (v != SEQ_START_TOKEN)
+			tcp_listen_unlock();
+		break;
+	case TCP_SEQ_STATE_TIME_WAIT:
+	case TCP_SEQ_STATE_ESTABLISHED:
+		if (v)
+			read_unlock(&tcp_ehash[st->bucket].lock);
+		local_bh_enable();
+		break;
+	}
+}
+
+static int tcp_seq_open(struct inode *inode, struct file *file)
+{
+	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
+	struct seq_file *seq;
+	struct tcp_iter_state *s;
+	int rc;
+
+	if (unlikely(afinfo == NULL))
+		return -EINVAL;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+	memset(s, 0, sizeof(*s));
+	s->family		= afinfo->family;
+	s->seq_ops.start	= tcp_seq_start;
+	s->seq_ops.next		= tcp_seq_next;
+	s->seq_ops.show		= afinfo->seq_show;
+	s->seq_ops.stop		= tcp_seq_stop;
+
+	rc = seq_open(file, &s->seq_ops);
+	if (rc)
+		goto out_kfree;
+	seq	     = file->private_data;
+	seq->private = s;
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
+{
+	int rc = 0;
+	struct proc_dir_entry *p;
+
+	if (!afinfo)
+		return -EINVAL;
+	afinfo->seq_fops->owner		= afinfo->owner;
+	afinfo->seq_fops->open		= tcp_seq_open;
+	afinfo->seq_fops->read		= seq_read;
+	afinfo->seq_fops->llseek	= seq_lseek;
+	afinfo->seq_fops->release	= seq_release_private;
+	
+	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
+	if (p)
+		p->data = afinfo;
+	else
+		rc = -ENOMEM;
+	return rc;
+}
+
+void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
+{
+	if (!afinfo)
+		return;
+	proc_net_remove(afinfo->name);
+	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); 
+}
+
+static void get_openreq4(struct sock *sk, struct open_request *req,
+			 char *tmpbuf, int i, int uid)
+{
+	int ttd = req->expires - jiffies;
+
+	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
+		i,
+		req->af.v4_req.loc_addr,
+		ntohs(inet_sk(sk)->sport),
+		req->af.v4_req.rmt_addr,
+		ntohs(req->rmt_port),
+		TCP_SYN_RECV,
+		0, 0, /* could print option size, but that is af dependent. */
+		1,    /* timers active (only the expire timer) */
+		jiffies_to_clock_t(ttd),
+		req->retrans,
+		uid,
+		0,  /* non standard timer */
+		0, /* open_requests have no inode */
+		atomic_read(&sk->sk_refcnt),
+		req);
+}
+
+static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
+{
+	int timer_active;
+	unsigned long timer_expires;
+	struct tcp_sock *tp = tcp_sk(sp);
+	struct inet_sock *inet = inet_sk(sp);
+	unsigned int dest = inet->daddr;
+	unsigned int src = inet->rcv_saddr;
+	__u16 destp = ntohs(inet->dport);
+	__u16 srcp = ntohs(inet->sport);
+
+	if (tp->pending == TCP_TIME_RETRANS) {
+		timer_active	= 1;
+		timer_expires	= tp->timeout;
+	} else if (tp->pending == TCP_TIME_PROBE0) {
+		timer_active	= 4;
+		timer_expires	= tp->timeout;
+	} else if (timer_pending(&sp->sk_timer)) {
+		timer_active	= 2;
+		timer_expires	= sp->sk_timer.expires;
+	} else {
+		timer_active	= 0;
+		timer_expires = jiffies;
+	}
+
+	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
+			"%08X %5d %8d %lu %d %p %u %u %u %u %d",
+		i, src, srcp, dest, destp, sp->sk_state,
+		tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
+		timer_active,
+		jiffies_to_clock_t(timer_expires - jiffies),
+		tp->retransmits,
+		sock_i_uid(sp),
+		tp->probes_out,
+		sock_i_ino(sp),
+		atomic_read(&sp->sk_refcnt), sp,
+		tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
+		tp->snd_cwnd,
+		tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
+}
+
+static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
+{
+	unsigned int dest, src;
+	__u16 destp, srcp;
+	int ttd = tw->tw_ttd - jiffies;
+
+	if (ttd < 0)
+		ttd = 0;
+
+	dest  = tw->tw_daddr;
+	src   = tw->tw_rcv_saddr;
+	destp = ntohs(tw->tw_dport);
+	srcp  = ntohs(tw->tw_sport);
+
+	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
+		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
+		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
+		atomic_read(&tw->tw_refcnt), tw);
+}
+
+#define TMPSZ 150
+
+static int tcp4_seq_show(struct seq_file *seq, void *v)
+{
+	struct tcp_iter_state* st;
+	char tmpbuf[TMPSZ + 1];
+
+	if (v == SEQ_START_TOKEN) {
+		seq_printf(seq, "%-*s\n", TMPSZ - 1,
+			   "  sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode");
+		goto out;
+	}
+	st = seq->private;
+
+	switch (st->state) {
+	case TCP_SEQ_STATE_LISTENING:
+	case TCP_SEQ_STATE_ESTABLISHED:
+		get_tcp4_sock(v, tmpbuf, st->num);
+		break;
+	case TCP_SEQ_STATE_OPENREQ:
+		get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
+		break;
+	case TCP_SEQ_STATE_TIME_WAIT:
+		get_timewait4_sock(v, tmpbuf, st->num);
+		break;
+	}
+	seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
+out:
+	return 0;
+}
+
+static struct file_operations tcp4_seq_fops;
+static struct tcp_seq_afinfo tcp4_seq_afinfo = {
+	.owner		= THIS_MODULE,
+	.name		= "tcp",
+	.family		= AF_INET,
+	.seq_show	= tcp4_seq_show,
+	.seq_fops	= &tcp4_seq_fops,
+};
+
+int __init tcp4_proc_init(void)
+{
+	return tcp_proc_register(&tcp4_seq_afinfo);
+}
+
+void tcp4_proc_exit(void)
+{
+	tcp_proc_unregister(&tcp4_seq_afinfo);
+}
+#endif /* CONFIG_PROC_FS */
+
+struct proto tcp_prot = {
+	.name			= "TCP",
+	.owner			= THIS_MODULE,
+	.close			= tcp_close,
+	.connect		= tcp_v4_connect,
+	.disconnect		= tcp_disconnect,
+	.accept			= tcp_accept,
+	.ioctl			= tcp_ioctl,
+	.init			= tcp_v4_init_sock,
+	.destroy		= tcp_v4_destroy_sock,
+	.shutdown		= tcp_shutdown,
+	.setsockopt		= tcp_setsockopt,
+	.getsockopt		= tcp_getsockopt,
+	.sendmsg		= tcp_sendmsg,
+	.recvmsg		= tcp_recvmsg,
+	.backlog_rcv		= tcp_v4_do_rcv,
+	.hash			= tcp_v4_hash,
+	.unhash			= tcp_unhash,
+	.get_port		= tcp_v4_get_port,
+	.enter_memory_pressure	= tcp_enter_memory_pressure,
+	.sockets_allocated	= &tcp_sockets_allocated,
+	.memory_allocated	= &tcp_memory_allocated,
+	.memory_pressure	= &tcp_memory_pressure,
+	.sysctl_mem		= sysctl_tcp_mem,
+	.sysctl_wmem		= sysctl_tcp_wmem,
+	.sysctl_rmem		= sysctl_tcp_rmem,
+	.max_header		= MAX_TCP_HEADER,
+	.obj_size		= sizeof(struct tcp_sock),
+};
+
+
+
+void __init tcp_v4_init(struct net_proto_family *ops)
+{
+	int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
+	if (err < 0)
+		panic("Failed to create the TCP control socket.\n");
+	tcp_socket->sk->sk_allocation   = GFP_ATOMIC;
+	inet_sk(tcp_socket->sk)->uc_ttl = -1;
+
+	/* Unhash it so that IP input processing does not even
+	 * see it, we do not wish this socket to see incoming
+	 * packets.
+	 */
+	tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
+}
+
+EXPORT_SYMBOL(ipv4_specific);
+EXPORT_SYMBOL(tcp_bind_hash);
+EXPORT_SYMBOL(tcp_bucket_create);
+EXPORT_SYMBOL(tcp_hashinfo);
+EXPORT_SYMBOL(tcp_inherit_port);
+EXPORT_SYMBOL(tcp_listen_wlock);
+EXPORT_SYMBOL(tcp_port_rover);
+EXPORT_SYMBOL(tcp_prot);
+EXPORT_SYMBOL(tcp_put_port);
+EXPORT_SYMBOL(tcp_unhash);
+EXPORT_SYMBOL(tcp_v4_conn_request);
+EXPORT_SYMBOL(tcp_v4_connect);
+EXPORT_SYMBOL(tcp_v4_do_rcv);
+EXPORT_SYMBOL(tcp_v4_rebuild_header);
+EXPORT_SYMBOL(tcp_v4_remember_stamp);
+EXPORT_SYMBOL(tcp_v4_send_check);
+EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
+
+#ifdef CONFIG_PROC_FS
+EXPORT_SYMBOL(tcp_proc_register);
+EXPORT_SYMBOL(tcp_proc_unregister);
+#endif
+EXPORT_SYMBOL(sysctl_local_port_range);
+EXPORT_SYMBOL(sysctl_max_syn_backlog);
+EXPORT_SYMBOL(sysctl_tcp_low_latency);
+EXPORT_SYMBOL(sysctl_tcp_tw_reuse);
+
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
new file mode 100644
index 000000000000..fd70509f0d53
--- /dev/null
+++ b/net/ipv4/tcp_minisocks.c
@@ -0,0 +1,1077 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version:	$Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/workqueue.h>
+#include <net/tcp.h>
+#include <net/inet_common.h>
+#include <net/xfrm.h>
+
+#ifdef CONFIG_SYSCTL
+#define SYNC_INIT 0 /* let the user enable it */
+#else
+#define SYNC_INIT 1
+#endif
+
+int sysctl_tcp_tw_recycle;
+int sysctl_tcp_max_tw_buckets = NR_FILE*2;
+
+int sysctl_tcp_syncookies = SYNC_INIT; 
+int sysctl_tcp_abort_on_overflow;
+
+static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
+
+static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
+{
+	if (seq == s_win)
+		return 1;
+	if (after(end_seq, s_win) && before(seq, e_win))
+		return 1;
+	return (seq == e_win && seq == end_seq);
+}
+
+/* New-style handling of TIME_WAIT sockets. */
+
+int tcp_tw_count;
+
+
+/* Must be called with locally disabled BHs. */
+static void tcp_timewait_kill(struct tcp_tw_bucket *tw)
+{
+	struct tcp_ehash_bucket *ehead;
+	struct tcp_bind_hashbucket *bhead;
+	struct tcp_bind_bucket *tb;
+
+	/* Unlink from established hashes. */
+	ehead = &tcp_ehash[tw->tw_hashent];
+	write_lock(&ehead->lock);
+	if (hlist_unhashed(&tw->tw_node)) {
+		write_unlock(&ehead->lock);
+		return;
+	}
+	__hlist_del(&tw->tw_node);
+	sk_node_init(&tw->tw_node);
+	write_unlock(&ehead->lock);
+
+	/* Disassociate with bind bucket. */
+	bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)];
+	spin_lock(&bhead->lock);
+	tb = tw->tw_tb;
+	__hlist_del(&tw->tw_bind_node);
+	tw->tw_tb = NULL;
+	tcp_bucket_destroy(tb);
+	spin_unlock(&bhead->lock);
+
+#ifdef INET_REFCNT_DEBUG
+	if (atomic_read(&tw->tw_refcnt) != 1) {
+		printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw,
+		       atomic_read(&tw->tw_refcnt));
+	}
+#endif
+	tcp_tw_put(tw);
+}
+
+/* 
+ * * Main purpose of TIME-WAIT state is to close connection gracefully,
+ *   when one of ends sits in LAST-ACK or CLOSING retransmitting FIN
+ *   (and, probably, tail of data) and one or more our ACKs are lost.
+ * * What is TIME-WAIT timeout? It is associated with maximal packet
+ *   lifetime in the internet, which results in wrong conclusion, that
+ *   it is set to catch "old duplicate segments" wandering out of their path.
+ *   It is not quite correct. This timeout is calculated so that it exceeds
+ *   maximal retransmission timeout enough to allow to lose one (or more)
+ *   segments sent by peer and our ACKs. This time may be calculated from RTO.
+ * * When TIME-WAIT socket receives RST, it means that another end
+ *   finally closed and we are allowed to kill TIME-WAIT too.
+ * * Second purpose of TIME-WAIT is catching old duplicate segments.
+ *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
+ *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
+ * * If we invented some more clever way to catch duplicates
+ *   (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs.
+ *
+ * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
+ * When you compare it to RFCs, please, read section SEGMENT ARRIVES
+ * from the very beginning.
+ *
+ * NOTE. With recycling (and later with fin-wait-2) TW bucket
+ * is _not_ stateless. It means, that strictly speaking we must
+ * spinlock it. I do not want! Well, probability of misbehaviour
+ * is ridiculously low and, seems, we could use some mb() tricks
+ * to avoid misread sequence numbers, states etc.  --ANK
+ */
+enum tcp_tw_status
+tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb,
+			   struct tcphdr *th, unsigned len)
+{
+	struct tcp_options_received tmp_opt;
+	int paws_reject = 0;
+
+	tmp_opt.saw_tstamp = 0;
+	if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) {
+		tcp_parse_options(skb, &tmp_opt, 0);
+
+		if (tmp_opt.saw_tstamp) {
+			tmp_opt.ts_recent	   = tw->tw_ts_recent;
+			tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
+			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+		}
+	}
+
+	if (tw->tw_substate == TCP_FIN_WAIT2) {
+		/* Just repeat all the checks of tcp_rcv_state_process() */
+
+		/* Out of window, send ACK */
+		if (paws_reject ||
+		    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+				   tw->tw_rcv_nxt,
+				   tw->tw_rcv_nxt + tw->tw_rcv_wnd))
+			return TCP_TW_ACK;
+
+		if (th->rst)
+			goto kill;
+
+		if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt))
+			goto kill_with_rst;
+
+		/* Dup ACK? */
+		if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) ||
+		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
+			tcp_tw_put(tw);
+			return TCP_TW_SUCCESS;
+		}
+
+		/* New data or FIN. If new data arrive after half-duplex close,
+		 * reset.
+		 */
+		if (!th->fin ||
+		    TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) {
+kill_with_rst:
+			tcp_tw_deschedule(tw);
+			tcp_tw_put(tw);
+			return TCP_TW_RST;
+		}
+
+		/* FIN arrived, enter true time-wait state. */
+		tw->tw_substate	= TCP_TIME_WAIT;
+		tw->tw_rcv_nxt	= TCP_SKB_CB(skb)->end_seq;
+		if (tmp_opt.saw_tstamp) {
+			tw->tw_ts_recent_stamp	= xtime.tv_sec;
+			tw->tw_ts_recent	= tmp_opt.rcv_tsval;
+		}
+
+		/* I am shamed, but failed to make it more elegant.
+		 * Yes, it is direct reference to IP, which is impossible
+		 * to generalize to IPv6. Taking into account that IPv6
+		 * do not undertsnad recycling in any case, it not
+		 * a big problem in practice. --ANK */
+		if (tw->tw_family == AF_INET &&
+		    sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp &&
+		    tcp_v4_tw_remember_stamp(tw))
+			tcp_tw_schedule(tw, tw->tw_timeout);
+		else
+			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+		return TCP_TW_ACK;
+	}
+
+	/*
+	 *	Now real TIME-WAIT state.
+	 *
+	 *	RFC 1122:
+	 *	"When a connection is [...] on TIME-WAIT state [...]
+	 *	[a TCP] MAY accept a new SYN from the remote TCP to
+	 *	reopen the connection directly, if it:
+	 *	
+	 *	(1)  assigns its initial sequence number for the new
+	 *	connection to be larger than the largest sequence
+	 *	number it used on the previous connection incarnation,
+	 *	and
+	 *
+	 *	(2)  returns to TIME-WAIT state if the SYN turns out 
+	 *	to be an old duplicate".
+	 */
+
+	if (!paws_reject &&
+	    (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt &&
+	     (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
+		/* In window segment, it may be only reset or bare ack. */
+
+		if (th->rst) {
+			/* This is TIME_WAIT assasination, in two flavors.
+			 * Oh well... nobody has a sufficient solution to this
+			 * protocol bug yet.
+			 */
+			if (sysctl_tcp_rfc1337 == 0) {
+kill:
+				tcp_tw_deschedule(tw);
+				tcp_tw_put(tw);
+				return TCP_TW_SUCCESS;
+			}
+		}
+		tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+
+		if (tmp_opt.saw_tstamp) {
+			tw->tw_ts_recent	= tmp_opt.rcv_tsval;
+			tw->tw_ts_recent_stamp	= xtime.tv_sec;
+		}
+
+		tcp_tw_put(tw);
+		return TCP_TW_SUCCESS;
+	}
+
+	/* Out of window segment.
+
+	   All the segments are ACKed immediately.
+
+	   The only exception is new SYN. We accept it, if it is
+	   not old duplicate and we are not in danger to be killed
+	   by delayed old duplicates. RFC check is that it has
+	   newer sequence number works at rates <40Mbit/sec.
+	   However, if paws works, it is reliable AND even more,
+	   we even may relax silly seq space cutoff.
+
+	   RED-PEN: we violate main RFC requirement, if this SYN will appear
+	   old duplicate (i.e. we receive RST in reply to SYN-ACK),
+	   we must return socket to time-wait state. It is not good,
+	   but not fatal yet.
+	 */
+
+	if (th->syn && !th->rst && !th->ack && !paws_reject &&
+	    (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) ||
+	     (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
+		u32 isn = tw->tw_snd_nxt + 65535 + 2;
+		if (isn == 0)
+			isn++;
+		TCP_SKB_CB(skb)->when = isn;
+		return TCP_TW_SYN;
+	}
+
+	if (paws_reject)
+		NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+
+	if(!th->rst) {
+		/* In this case we must reset the TIMEWAIT timer.
+		 *
+		 * If it is ACKless SYN it may be both old duplicate
+		 * and new good SYN with random sequence number <rcv_nxt.
+		 * Do not reschedule in the last case.
+		 */
+		if (paws_reject || th->ack)
+			tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
+
+		/* Send ACK. Note, we do not put the bucket,
+		 * it will be released by caller.
+		 */
+		return TCP_TW_ACK;
+	}
+	tcp_tw_put(tw);
+	return TCP_TW_SUCCESS;
+}
+
+/* Enter the time wait state.  This is called with locally disabled BH.
+ * Essentially we whip up a timewait bucket, copy the
+ * relevant info into it from the SK, and mess with hash chains
+ * and list linkage.
+ */
+static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw)
+{
+	struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent];
+	struct tcp_bind_hashbucket *bhead;
+
+	/* Step 1: Put TW into bind hash. Original socket stays there too.
+	   Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in
+	   binding cache, even if it is closed.
+	 */
+	bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)];
+	spin_lock(&bhead->lock);
+	tw->tw_tb = tcp_sk(sk)->bind_hash;
+	BUG_TRAP(tcp_sk(sk)->bind_hash);
+	tw_add_bind_node(tw, &tw->tw_tb->owners);
+	spin_unlock(&bhead->lock);
+
+	write_lock(&ehead->lock);
+
+	/* Step 2: Remove SK from established hash. */
+	if (__sk_del_node_init(sk))
+		sock_prot_dec_use(sk->sk_prot);
+
+	/* Step 3: Hash TW into TIMEWAIT half of established hash table. */
+	tw_add_node(tw, &(ehead + tcp_ehash_size)->chain);
+	atomic_inc(&tw->tw_refcnt);
+
+	write_unlock(&ehead->lock);
+}
+
+/* 
+ * Move a socket to time-wait or dead fin-wait-2 state.
+ */ 
+void tcp_time_wait(struct sock *sk, int state, int timeo)
+{
+	struct tcp_tw_bucket *tw = NULL;
+	struct tcp_sock *tp = tcp_sk(sk);
+	int recycle_ok = 0;
+
+	if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
+		recycle_ok = tp->af_specific->remember_stamp(sk);
+
+	if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
+		tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC);
+
+	if(tw != NULL) {
+		struct inet_sock *inet = inet_sk(sk);
+		int rto = (tp->rto<<2) - (tp->rto>>1);
+
+		/* Give us an identity. */
+		tw->tw_daddr		= inet->daddr;
+		tw->tw_rcv_saddr	= inet->rcv_saddr;
+		tw->tw_bound_dev_if	= sk->sk_bound_dev_if;
+		tw->tw_num		= inet->num;
+		tw->tw_state		= TCP_TIME_WAIT;
+		tw->tw_substate		= state;
+		tw->tw_sport		= inet->sport;
+		tw->tw_dport		= inet->dport;
+		tw->tw_family		= sk->sk_family;
+		tw->tw_reuse		= sk->sk_reuse;
+		tw->tw_rcv_wscale	= tp->rx_opt.rcv_wscale;
+		atomic_set(&tw->tw_refcnt, 1);
+
+		tw->tw_hashent		= sk->sk_hashent;
+		tw->tw_rcv_nxt		= tp->rcv_nxt;
+		tw->tw_snd_nxt		= tp->snd_nxt;
+		tw->tw_rcv_wnd		= tcp_receive_window(tp);
+		tw->tw_ts_recent	= tp->rx_opt.ts_recent;
+		tw->tw_ts_recent_stamp	= tp->rx_opt.ts_recent_stamp;
+		tw_dead_node_init(tw);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		if (tw->tw_family == PF_INET6) {
+			struct ipv6_pinfo *np = inet6_sk(sk);
+
+			ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr);
+			ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr);
+			tw->tw_v6_ipv6only = np->ipv6only;
+		} else {
+			memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr));
+			memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr));
+			tw->tw_v6_ipv6only = 0;
+		}
+#endif
+		/* Linkage updates. */
+		__tcp_tw_hashdance(sk, tw);
+
+		/* Get the TIME_WAIT timeout firing. */
+		if (timeo < rto)
+			timeo = rto;
+
+		if (recycle_ok) {
+			tw->tw_timeout = rto;
+		} else {
+			tw->tw_timeout = TCP_TIMEWAIT_LEN;
+			if (state == TCP_TIME_WAIT)
+				timeo = TCP_TIMEWAIT_LEN;
+		}
+
+		tcp_tw_schedule(tw, timeo);
+		tcp_tw_put(tw);
+	} else {
+		/* Sorry, if we're out of memory, just CLOSE this
+		 * socket up.  We've got bigger problems than
+		 * non-graceful socket closings.
+		 */
+		if (net_ratelimit())
+			printk(KERN_INFO "TCP: time wait bucket table overflow\n");
+	}
+
+	tcp_update_metrics(sk);
+	tcp_done(sk);
+}
+
+/* Kill off TIME_WAIT sockets once their lifetime has expired. */
+static int tcp_tw_death_row_slot;
+
+static void tcp_twkill(unsigned long);
+
+/* TIME_WAIT reaping mechanism. */
+#define TCP_TWKILL_SLOTS	8	/* Please keep this a power of 2. */
+#define TCP_TWKILL_PERIOD	(TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
+
+#define TCP_TWKILL_QUOTA	100
+
+static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
+static DEFINE_SPINLOCK(tw_death_lock);
+static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
+static void twkill_work(void *);
+static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
+static u32 twkill_thread_slots;
+
+/* Returns non-zero if quota exceeded.  */
+static int tcp_do_twkill_work(int slot, unsigned int quota)
+{
+	struct tcp_tw_bucket *tw;
+	struct hlist_node *node;
+	unsigned int killed;
+	int ret;
+
+	/* NOTE: compare this to previous version where lock
+	 * was released after detaching chain. It was racy,
+	 * because tw buckets are scheduled in not serialized context
+	 * in 2.3 (with netfilter), and with softnet it is common, because
+	 * soft irqs are not sequenced.
+	 */
+	killed = 0;
+	ret = 0;
+rescan:
+	tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
+		__tw_del_dead_node(tw);
+		spin_unlock(&tw_death_lock);
+		tcp_timewait_kill(tw);
+		tcp_tw_put(tw);
+		killed++;
+		spin_lock(&tw_death_lock);
+		if (killed > quota) {
+			ret = 1;
+			break;
+		}
+
+		/* While we dropped tw_death_lock, another cpu may have
+		 * killed off the next TW bucket in the list, therefore
+		 * do a fresh re-read of the hlist head node with the
+		 * lock reacquired.  We still use the hlist traversal
+		 * macro in order to get the prefetches.
+		 */
+		goto rescan;
+	}
+
+	tcp_tw_count -= killed;
+	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);
+
+	return ret;
+}
+
+static void tcp_twkill(unsigned long dummy)
+{
+	int need_timer, ret;
+
+	spin_lock(&tw_death_lock);
+
+	if (tcp_tw_count == 0)
+		goto out;
+
+	need_timer = 0;
+	ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
+	if (ret) {
+		twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
+		mb();
+		schedule_work(&tcp_twkill_work);
+		need_timer = 1;
+	} else {
+		/* We purged the entire slot, anything left?  */
+		if (tcp_tw_count)
+			need_timer = 1;
+	}
+	tcp_tw_death_row_slot =
+		((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
+	if (need_timer)
+		mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
+out:
+	spin_unlock(&tw_death_lock);
+}
+
+extern void twkill_slots_invalid(void);
+
+static void twkill_work(void *dummy)
+{
+	int i;
+
+	if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
+		twkill_slots_invalid();
+
+	while (twkill_thread_slots) {
+		spin_lock_bh(&tw_death_lock);
+		for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
+			if (!(twkill_thread_slots & (1 << i)))
+				continue;
+
+			while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
+				if (need_resched()) {
+					spin_unlock_bh(&tw_death_lock);
+					schedule();
+					spin_lock_bh(&tw_death_lock);
+				}
+			}
+
+			twkill_thread_slots &= ~(1 << i);
+		}
+		spin_unlock_bh(&tw_death_lock);
+	}
+}
+
+/* These are always called from BH context.  See callers in
+ * tcp_input.c to verify this.
+ */
+
+/* This is for handling early-kills of TIME_WAIT sockets. */
+void tcp_tw_deschedule(struct tcp_tw_bucket *tw)
+{
+	spin_lock(&tw_death_lock);
+	if (tw_del_dead_node(tw)) {
+		tcp_tw_put(tw);
+		if (--tcp_tw_count == 0)
+			del_timer(&tcp_tw_timer);
+	}
+	spin_unlock(&tw_death_lock);
+	tcp_timewait_kill(tw);
+}
+
+/* Short-time timewait calendar */
+
+static int tcp_twcal_hand = -1;
+static int tcp_twcal_jiffie;
+static void tcp_twcal_tick(unsigned long);
+static struct timer_list tcp_twcal_timer =
+		TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
+static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];
+
+static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo)
+{
+	struct hlist_head *list;
+	int slot;
+
+	/* timeout := RTO * 3.5
+	 *
+	 * 3.5 = 1+2+0.5 to wait for two retransmits.
+	 *
+	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
+	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
+	 * FINs (or previous seqments) are lost (probability of such event
+	 * is p^(N+1), where p is probability to lose single packet and
+	 * time to detect the loss is about RTO*(2^N - 1) with exponential
+	 * backoff). Normal timewait length is calculated so, that we
+	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
+	 * [ BTW Linux. following BSD, violates this requirement waiting
+	 *   only for 60sec, we should wait at least for 240 secs.
+	 *   Well, 240 consumes too much of resources 8)
+	 * ]
+	 * This interval is not reduced to catch old duplicate and
+	 * responces to our wandering segments living for two MSLs.
+	 * However, if we use PAWS to detect
+	 * old duplicates, we can reduce the interval to bounds required
+	 * by RTO, rather than MSL. So, if peer understands PAWS, we
+	 * kill tw bucket after 3.5*RTO (it is important that this number
+	 * is greater than TS tick!) and detect old duplicates with help
+	 * of PAWS.
+	 */
+	slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
+
+	spin_lock(&tw_death_lock);
+
+	/* Unlink it, if it was scheduled */
+	if (tw_del_dead_node(tw))
+		tcp_tw_count--;
+	else
+		atomic_inc(&tw->tw_refcnt);
+
+	if (slot >= TCP_TW_RECYCLE_SLOTS) {
+		/* Schedule to slow timer */
+		if (timeo >= TCP_TIMEWAIT_LEN) {
+			slot = TCP_TWKILL_SLOTS-1;
+		} else {
+			slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
+			if (slot >= TCP_TWKILL_SLOTS)
+				slot = TCP_TWKILL_SLOTS-1;
+		}
+		tw->tw_ttd = jiffies + timeo;
+		slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
+		list = &tcp_tw_death_row[slot];
+	} else {
+		tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);
+
+		if (tcp_twcal_hand < 0) {
+			tcp_twcal_hand = 0;
+			tcp_twcal_jiffie = jiffies;
+			tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
+			add_timer(&tcp_twcal_timer);
+		} else {
+			if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
+				mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
+			slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
+		}
+		list = &tcp_twcal_row[slot];
+	}
+
+	hlist_add_head(&tw->tw_death_node, list);
+
+	if (tcp_tw_count++ == 0)
+		mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
+	spin_unlock(&tw_death_lock);
+}
+
+void tcp_twcal_tick(unsigned long dummy)
+{
+	int n, slot;
+	unsigned long j;
+	unsigned long now = jiffies;
+	int killed = 0;
+	int adv = 0;
+
+	spin_lock(&tw_death_lock);
+	if (tcp_twcal_hand < 0)
+		goto out;
+
+	slot = tcp_twcal_hand;
+	j = tcp_twcal_jiffie;
+
+	for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
+		if (time_before_eq(j, now)) {
+			struct hlist_node *node, *safe;
+			struct tcp_tw_bucket *tw;
+
+			tw_for_each_inmate_safe(tw, node, safe,
+					   &tcp_twcal_row[slot]) {
+				__tw_del_dead_node(tw);
+				tcp_timewait_kill(tw);
+				tcp_tw_put(tw);
+				killed++;
+			}
+		} else {
+			if (!adv) {
+				adv = 1;
+				tcp_twcal_jiffie = j;
+				tcp_twcal_hand = slot;
+			}
+
+			if (!hlist_empty(&tcp_twcal_row[slot])) {
+				mod_timer(&tcp_twcal_timer, j);
+				goto out;
+			}
+		}
+		j += (1<<TCP_TW_RECYCLE_TICK);
+		slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
+	}
+	tcp_twcal_hand = -1;
+
+out:
+	if ((tcp_tw_count -= killed) == 0)
+		del_timer(&tcp_tw_timer);
+	NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
+	spin_unlock(&tw_death_lock);
+}
+
+/* This is not only more efficient than what we used to do, it eliminates
+ * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
+ *
+ * Actually, we could lots of memory writes here. tp of listening
+ * socket contains all necessary default parameters.
+ */
+struct sock *tcp_create_openreq_child(struct sock *sk, struct open_request *req, struct sk_buff *skb)
+{
+	/* allocate the newsk from the same slab of the master sock,
+	 * if not, at sk_free time we'll try to free it from the wrong
+	 * slabcache (i.e. is it TCPv4 or v6?), this is handled thru sk->sk_prot -acme */
+	struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);
+
+	if(newsk != NULL) {
+		struct tcp_sock *newtp;
+		struct sk_filter *filter;
+
+		memcpy(newsk, sk, sizeof(struct tcp_sock));
+		newsk->sk_state = TCP_SYN_RECV;
+
+		/* SANITY */
+		sk_node_init(&newsk->sk_node);
+		tcp_sk(newsk)->bind_hash = NULL;
+
+		/* Clone the TCP header template */
+		inet_sk(newsk)->dport = req->rmt_port;
+
+		sock_lock_init(newsk);
+		bh_lock_sock(newsk);
+
+		rwlock_init(&newsk->sk_dst_lock);
+		atomic_set(&newsk->sk_rmem_alloc, 0);
+		skb_queue_head_init(&newsk->sk_receive_queue);
+		atomic_set(&newsk->sk_wmem_alloc, 0);
+		skb_queue_head_init(&newsk->sk_write_queue);
+		atomic_set(&newsk->sk_omem_alloc, 0);
+		newsk->sk_wmem_queued = 0;
+		newsk->sk_forward_alloc = 0;
+
+		sock_reset_flag(newsk, SOCK_DONE);
+		newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+		newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
+		newsk->sk_send_head = NULL;
+		rwlock_init(&newsk->sk_callback_lock);
+		skb_queue_head_init(&newsk->sk_error_queue);
+		newsk->sk_write_space = sk_stream_write_space;
+
+		if ((filter = newsk->sk_filter) != NULL)
+			sk_filter_charge(newsk, filter);
+
+		if (unlikely(xfrm_sk_clone_policy(newsk))) {
+			/* It is still raw copy of parent, so invalidate
+			 * destructor and make plain sk_free() */
+			newsk->sk_destruct = NULL;
+			sk_free(newsk);
+			return NULL;
+		}
+
+		/* Now setup tcp_sock */
+		newtp = tcp_sk(newsk);
+		newtp->pred_flags = 0;
+		newtp->rcv_nxt = req->rcv_isn + 1;
+		newtp->snd_nxt = req->snt_isn + 1;
+		newtp->snd_una = req->snt_isn + 1;
+		newtp->snd_sml = req->snt_isn + 1;
+
+		tcp_prequeue_init(newtp);
+
+		tcp_init_wl(newtp, req->snt_isn, req->rcv_isn);
+
+		newtp->retransmits = 0;
+		newtp->backoff = 0;
+		newtp->srtt = 0;
+		newtp->mdev = TCP_TIMEOUT_INIT;
+		newtp->rto = TCP_TIMEOUT_INIT;
+
+		newtp->packets_out = 0;
+		newtp->left_out = 0;
+		newtp->retrans_out = 0;
+		newtp->sacked_out = 0;
+		newtp->fackets_out = 0;
+		newtp->snd_ssthresh = 0x7fffffff;
+
+		/* So many TCP implementations out there (incorrectly) count the
+		 * initial SYN frame in their delayed-ACK and congestion control
+		 * algorithms that we must have the following bandaid to talk
+		 * efficiently to them.  -DaveM
+		 */
+		newtp->snd_cwnd = 2;
+		newtp->snd_cwnd_cnt = 0;
+
+		newtp->frto_counter = 0;
+		newtp->frto_highmark = 0;
+
+		tcp_set_ca_state(newtp, TCP_CA_Open);
+		tcp_init_xmit_timers(newsk);
+		skb_queue_head_init(&newtp->out_of_order_queue);
+		newtp->rcv_wup = req->rcv_isn + 1;
+		newtp->write_seq = req->snt_isn + 1;
+		newtp->pushed_seq = newtp->write_seq;
+		newtp->copied_seq = req->rcv_isn + 1;
+
+		newtp->rx_opt.saw_tstamp = 0;
+
+		newtp->rx_opt.dsack = 0;
+		newtp->rx_opt.eff_sacks = 0;
+
+		newtp->probes_out = 0;
+		newtp->rx_opt.num_sacks = 0;
+		newtp->urg_data = 0;
+		newtp->listen_opt = NULL;
+		newtp->accept_queue = newtp->accept_queue_tail = NULL;
+		/* Deinitialize syn_wait_lock to trap illegal accesses. */
+		memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock));
+
+		/* Back to base struct sock members. */
+		newsk->sk_err = 0;
+		newsk->sk_priority = 0;
+		atomic_set(&newsk->sk_refcnt, 2);
+#ifdef INET_REFCNT_DEBUG
+		atomic_inc(&inet_sock_nr);
+#endif
+		atomic_inc(&tcp_sockets_allocated);
+
+		if (sock_flag(newsk, SOCK_KEEPOPEN))
+			tcp_reset_keepalive_timer(newsk,
+						  keepalive_time_when(newtp));
+		newsk->sk_socket = NULL;
+		newsk->sk_sleep = NULL;
+
+		newtp->rx_opt.tstamp_ok = req->tstamp_ok;
+		if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) {
+			if (sysctl_tcp_fack)
+				newtp->rx_opt.sack_ok |= 2;
+		}
+		newtp->window_clamp = req->window_clamp;
+		newtp->rcv_ssthresh = req->rcv_wnd;
+		newtp->rcv_wnd = req->rcv_wnd;
+		newtp->rx_opt.wscale_ok = req->wscale_ok;
+		if (newtp->rx_opt.wscale_ok) {
+			newtp->rx_opt.snd_wscale = req->snd_wscale;
+			newtp->rx_opt.rcv_wscale = req->rcv_wscale;
+		} else {
+			newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
+			newtp->window_clamp = min(newtp->window_clamp, 65535U);
+		}
+		newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
+		newtp->max_window = newtp->snd_wnd;
+
+		if (newtp->rx_opt.tstamp_ok) {
+			newtp->rx_opt.ts_recent = req->ts_recent;
+			newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
+			newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
+		} else {
+			newtp->rx_opt.ts_recent_stamp = 0;
+			newtp->tcp_header_len = sizeof(struct tcphdr);
+		}
+		if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
+			newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
+		newtp->rx_opt.mss_clamp = req->mss;
+		TCP_ECN_openreq_child(newtp, req);
+		if (newtp->ecn_flags&TCP_ECN_OK)
+			sock_set_flag(newsk, SOCK_NO_LARGESEND);
+
+		tcp_ca_init(newtp);
+
+		TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
+	}
+	return newsk;
+}
+
+/* 
+ *	Process an incoming packet for SYN_RECV sockets represented
+ *	as an open_request.
+ */
+
+struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb,
+			   struct open_request *req,
+			   struct open_request **prev)
+{
+	struct tcphdr *th = skb->h.th;
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
+	int paws_reject = 0;
+	struct tcp_options_received tmp_opt;
+	struct sock *child;
+
+	tmp_opt.saw_tstamp = 0;
+	if (th->doff > (sizeof(struct tcphdr)>>2)) {
+		tcp_parse_options(skb, &tmp_opt, 0);
+
+		if (tmp_opt.saw_tstamp) {
+			tmp_opt.ts_recent = req->ts_recent;
+			/* We do not store true stamp, but it is not required,
+			 * it can be estimated (approximately)
+			 * from another data.
+			 */
+			tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
+			paws_reject = tcp_paws_check(&tmp_opt, th->rst);
+		}
+	}
+
+	/* Check for pure retransmitted SYN. */
+	if (TCP_SKB_CB(skb)->seq == req->rcv_isn &&
+	    flg == TCP_FLAG_SYN &&
+	    !paws_reject) {
+		/*
+		 * RFC793 draws (Incorrectly! It was fixed in RFC1122)
+		 * this case on figure 6 and figure 8, but formal
+		 * protocol description says NOTHING.
+		 * To be more exact, it says that we should send ACK,
+		 * because this segment (at least, if it has no data)
+		 * is out of window.
+		 *
+		 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
+		 *  describe SYN-RECV state. All the description
+		 *  is wrong, we cannot believe to it and should
+		 *  rely only on common sense and implementation
+		 *  experience.
+		 *
+		 * Enforce "SYN-ACK" according to figure 8, figure 6
+		 * of RFC793, fixed by RFC1122.
+		 */
+		req->class->rtx_syn_ack(sk, req, NULL);
+		return NULL;
+	}
+
+	/* Further reproduces section "SEGMENT ARRIVES"
+	   for state SYN-RECEIVED of RFC793.
+	   It is broken, however, it does not work only
+	   when SYNs are crossed.
+
+	   You would think that SYN crossing is impossible here, since
+	   we should have a SYN_SENT socket (from connect()) on our end,
+	   but this is not true if the crossed SYNs were sent to both
+	   ends by a malicious third party.  We must defend against this,
+	   and to do that we first verify the ACK (as per RFC793, page
+	   36) and reset if it is invalid.  Is this a true full defense?
+	   To convince ourselves, let us consider a way in which the ACK
+	   test can still pass in this 'malicious crossed SYNs' case.
+	   Malicious sender sends identical SYNs (and thus identical sequence
+	   numbers) to both A and B:
+
+		A: gets SYN, seq=7
+		B: gets SYN, seq=7
+
+	   By our good fortune, both A and B select the same initial
+	   send sequence number of seven :-)
+
+		A: sends SYN|ACK, seq=7, ack_seq=8
+		B: sends SYN|ACK, seq=7, ack_seq=8
+
+	   So we are now A eating this SYN|ACK, ACK test passes.  So
+	   does sequence test, SYN is truncated, and thus we consider
+	   it a bare ACK.
+
+	   If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
+	   we create an established connection.  Both ends (listening sockets)
+	   accept the new incoming connection and try to talk to each other. 8-)
+
+	   Note: This case is both harmless, and rare.  Possibility is about the
+	   same as us discovering intelligent life on another plant tomorrow.
+
+	   But generally, we should (RFC lies!) to accept ACK
+	   from SYNACK both here and in tcp_rcv_state_process().
+	   tcp_rcv_state_process() does not, hence, we do not too.
+
+	   Note that the case is absolutely generic:
+	   we cannot optimize anything here without
+	   violating protocol. All the checks must be made
+	   before attempt to create socket.
+	 */
+
+	/* RFC793 page 36: "If the connection is in any non-synchronized state ...
+	 *                  and the incoming segment acknowledges something not yet
+	 *                  sent (the segment carries an unaccaptable ACK) ...
+	 *                  a reset is sent."
+	 *
+	 * Invalid ACK: reset will be sent by listening socket
+	 */
+	if ((flg & TCP_FLAG_ACK) &&
+	    (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1))
+		return sk;
+
+	/* Also, it would be not so bad idea to check rcv_tsecr, which
+	 * is essentially ACK extension and too early or too late values
+	 * should cause reset in unsynchronized states.
+	 */
+
+	/* RFC793: "first check sequence number". */
+
+	if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+					  req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) {
+		/* Out of window: send ACK and drop. */
+		if (!(flg & TCP_FLAG_RST))
+			req->class->send_ack(skb, req);
+		if (paws_reject)
+			NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
+		return NULL;
+	}
+
+	/* In sequence, PAWS is OK. */
+
+	if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1))
+			req->ts_recent = tmp_opt.rcv_tsval;
+
+		if (TCP_SKB_CB(skb)->seq == req->rcv_isn) {
+			/* Truncate SYN, it is out of window starting
+			   at req->rcv_isn+1. */
+			flg &= ~TCP_FLAG_SYN;
+		}
+
+		/* RFC793: "second check the RST bit" and
+		 *	   "fourth, check the SYN bit"
+		 */
+		if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
+			goto embryonic_reset;
+
+		/* ACK sequence verified above, just make sure ACK is
+		 * set.  If ACK not set, just silently drop the packet.
+		 */
+		if (!(flg & TCP_FLAG_ACK))
+			return NULL;
+
+		/* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
+		if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) {
+			req->acked = 1;
+			return NULL;
+		}
+
+		/* OK, ACK is valid, create big socket and
+		 * feed this segment to it. It will repeat all
+		 * the tests. THIS SEGMENT MUST MOVE SOCKET TO
+		 * ESTABLISHED STATE. If it will be dropped after
+		 * socket is created, wait for troubles.
+		 */
+		child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
+		if (child == NULL)
+			goto listen_overflow;
+
+		tcp_synq_unlink(tp, req, prev);
+		tcp_synq_removed(sk, req);
+
+		tcp_acceptq_queue(sk, req, child);
+		return child;
+
+	listen_overflow:
+		if (!sysctl_tcp_abort_on_overflow) {
+			req->acked = 1;
+			return NULL;
+		}
+
+	embryonic_reset:
+		NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
+		if (!(flg & TCP_FLAG_RST))
+			req->class->send_reset(skb);
+
+		tcp_synq_drop(sk, req, prev);
+		return NULL;
+}
+
+/*
+ * Queue segment on the new socket if the new socket is active,
+ * otherwise we just shortcircuit this and continue with
+ * the new socket.
+ */
+
+int tcp_child_process(struct sock *parent, struct sock *child,
+		      struct sk_buff *skb)
+{
+	int ret = 0;
+	int state = child->sk_state;
+
+	if (!sock_owned_by_user(child)) {
+		ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);
+
+		/* Wakeup parent, send SIGIO */
+		if (state == TCP_SYN_RECV && child->sk_state != state)
+			parent->sk_data_ready(parent, 0);
+	} else {
+		/* Alas, it is possible again, because we do lookup
+		 * in main socket hash table and lock on listening
+		 * socket does not protect us more.
+		 */
+		sk_add_backlog(child, skb);
+	}
+
+	bh_unlock_sock(child);
+	sock_put(child);
+	return ret;
+}
+
+EXPORT_SYMBOL(tcp_check_req);
+EXPORT_SYMBOL(tcp_child_process);
+EXPORT_SYMBOL(tcp_create_openreq_child);
+EXPORT_SYMBOL(tcp_timewait_state_process);
+EXPORT_SYMBOL(tcp_tw_deschedule);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
new file mode 100644
index 000000000000..13c14cb6dee4
--- /dev/null
+++ b/net/ipv4/tcp_output.c
@@ -0,0 +1,1739 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version:	$Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+/*
+ * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
+ *				:	Fragmentation on mtu decrease
+ *				:	Segment collapse on retransmit
+ *				:	AF independence
+ *
+ *		Linus Torvalds	:	send_delayed_ack
+ *		David S. Miller	:	Charge memory using the right skb
+ *					during syn/ack processing.
+ *		David S. Miller :	Output engine completely rewritten.
+ *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
+ *		Cacophonix Gaul :	draft-minshall-nagle-01
+ *		J Hadi Salim	:	ECN support
+ *
+ */
+
+#include <net/tcp.h>
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <linux/smp_lock.h>
+
+/* People can turn this off for buggy TCP's found in printers etc. */
+int sysctl_tcp_retrans_collapse = 1;
+
+/* This limits the percentage of the congestion window which we
+ * will allow a single TSO frame to consume.  Building TSO frames
+ * which are too large can cause TCP streams to be bursty.
+ */
+int sysctl_tcp_tso_win_divisor = 8;
+
+static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
+				    struct sk_buff *skb)
+{
+	sk->sk_send_head = skb->next;
+	if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
+		sk->sk_send_head = NULL;
+	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+	tcp_packets_out_inc(sk, tp, skb);
+}
+
+/* SND.NXT, if window was not shrunk.
+ * If window has been shrunk, what should we make? It is not clear at all.
+ * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
+ * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
+ * invalid. OK, let's make this for now:
+ */
+static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp)
+{
+	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
+		return tp->snd_nxt;
+	else
+		return tp->snd_una+tp->snd_wnd;
+}
+
+/* Calculate mss to advertise in SYN segment.
+ * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
+ *
+ * 1. It is independent of path mtu.
+ * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
+ * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
+ *    attached devices, because some buggy hosts are confused by
+ *    large MSS.
+ * 4. We do not make 3, we advertise MSS, calculated from first
+ *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
+ *    This may be overridden via information stored in routing table.
+ * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
+ *    probably even Jumbo".
+ */
+static __u16 tcp_advertise_mss(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+	int mss = tp->advmss;
+
+	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
+		mss = dst_metric(dst, RTAX_ADVMSS);
+		tp->advmss = mss;
+	}
+
+	return (__u16)mss;
+}
+
+/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
+ * This is the first part of cwnd validation mechanism. */
+static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+{
+	s32 delta = tcp_time_stamp - tp->lsndtime;
+	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
+	u32 cwnd = tp->snd_cwnd;
+
+	if (tcp_is_vegas(tp)) 
+		tcp_vegas_enable(tp);
+
+	tp->snd_ssthresh = tcp_current_ssthresh(tp);
+	restart_cwnd = min(restart_cwnd, cwnd);
+
+	while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+		cwnd >>= 1;
+	tp->snd_cwnd = max(cwnd, restart_cwnd);
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+	tp->snd_cwnd_used = 0;
+}
+
+static inline void tcp_event_data_sent(struct tcp_sock *tp,
+				       struct sk_buff *skb, struct sock *sk)
+{
+	u32 now = tcp_time_stamp;
+
+	if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
+		tcp_cwnd_restart(tp, __sk_dst_get(sk));
+
+	tp->lsndtime = now;
+
+	/* If it is a reply for ato after last received
+	 * packet, enter pingpong mode.
+	 */
+	if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
+		tp->ack.pingpong = 1;
+}
+
+static __inline__ void tcp_event_ack_sent(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tcp_dec_quickack_mode(tp);
+	tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+}
+
+/* Determine a window scaling and initial window to offer.
+ * Based on the assumption that the given amount of space
+ * will be offered. Store the results in the tp structure.
+ * NOTE: for smooth operation initial space offering should
+ * be a multiple of mss if possible. We assume here that mss >= 1.
+ * This MUST be enforced by all callers.
+ */
+void tcp_select_initial_window(int __space, __u32 mss,
+			       __u32 *rcv_wnd, __u32 *window_clamp,
+			       int wscale_ok, __u8 *rcv_wscale)
+{
+	unsigned int space = (__space < 0 ? 0 : __space);
+
+	/* If no clamp set the clamp to the max possible scaled window */
+	if (*window_clamp == 0)
+		(*window_clamp) = (65535 << 14);
+	space = min(*window_clamp, space);
+
+	/* Quantize space offering to a multiple of mss if possible. */
+	if (space > mss)
+		space = (space / mss) * mss;
+
+	/* NOTE: offering an initial window larger than 32767
+	 * will break some buggy TCP stacks. We try to be nice.
+	 * If we are not window scaling, then this truncates
+	 * our initial window offering to 32k. There should also
+	 * be a sysctl option to stop being nice.
+	 */
+	(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+	(*rcv_wscale) = 0;
+	if (wscale_ok) {
+		/* Set window scaling on max possible window
+		 * See RFC1323 for an explanation of the limit to 14 
+		 */
+		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+		while (space > 65535 && (*rcv_wscale) < 14) {
+			space >>= 1;
+			(*rcv_wscale)++;
+		}
+	}
+
+	/* Set initial window to value enough for senders,
+	 * following RFC1414. Senders, not following this RFC,
+	 * will be satisfied with 2.
+	 */
+	if (mss > (1<<*rcv_wscale)) {
+		int init_cwnd = 4;
+		if (mss > 1460*3)
+			init_cwnd = 2;
+		else if (mss > 1460)
+			init_cwnd = 3;
+		if (*rcv_wnd > init_cwnd*mss)
+			*rcv_wnd = init_cwnd*mss;
+	}
+
+	/* Set the clamp no higher than max representable value */
+	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
+}
+
+/* Chose a new window to advertise, update state in tcp_sock for the
+ * socket, and return result with RFC1323 scaling applied.  The return
+ * value can be stuffed directly into th->window for an outgoing
+ * frame.
+ */
+static __inline__ u16 tcp_select_window(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 cur_win = tcp_receive_window(tp);
+	u32 new_win = __tcp_select_window(sk);
+
+	/* Never shrink the offered window */
+	if(new_win < cur_win) {
+		/* Danger Will Robinson!
+		 * Don't update rcv_wup/rcv_wnd here or else
+		 * we will not be able to advertise a zero
+		 * window in time.  --DaveM
+		 *
+		 * Relax Will Robinson.
+		 */
+		new_win = cur_win;
+	}
+	tp->rcv_wnd = new_win;
+	tp->rcv_wup = tp->rcv_nxt;
+
+	/* Make sure we do not exceed the maximum possible
+	 * scaled window.
+	 */
+	if (!tp->rx_opt.rcv_wscale)
+		new_win = min(new_win, MAX_TCP_WINDOW);
+	else
+		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
+
+	/* RFC1323 scaling applied */
+	new_win >>= tp->rx_opt.rcv_wscale;
+
+	/* If we advertise zero window, disable fast path. */
+	if (new_win == 0)
+		tp->pred_flags = 0;
+
+	return new_win;
+}
+
+
+/* This routine actually transmits TCP packets queued in by
+ * tcp_do_sendmsg().  This is used by both the initial
+ * transmission and possible later retransmissions.
+ * All SKB's seen here are completely headerless.  It is our
+ * job to build the TCP header, and pass the packet down to
+ * IP so it can do the same plus pass the packet off to the
+ * device.
+ *
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ */
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb != NULL) {
+		struct inet_sock *inet = inet_sk(sk);
+		struct tcp_sock *tp = tcp_sk(sk);
+		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
+		int tcp_header_size = tp->tcp_header_len;
+		struct tcphdr *th;
+		int sysctl_flags;
+		int err;
+
+		BUG_ON(!tcp_skb_pcount(skb));
+
+#define SYSCTL_FLAG_TSTAMPS	0x1
+#define SYSCTL_FLAG_WSCALE	0x2
+#define SYSCTL_FLAG_SACK	0x4
+
+		sysctl_flags = 0;
+		if (tcb->flags & TCPCB_FLAG_SYN) {
+			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
+			if(sysctl_tcp_timestamps) {
+				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
+				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
+			}
+			if(sysctl_tcp_window_scaling) {
+				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
+				sysctl_flags |= SYSCTL_FLAG_WSCALE;
+			}
+			if(sysctl_tcp_sack) {
+				sysctl_flags |= SYSCTL_FLAG_SACK;
+				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
+			}
+		} else if (tp->rx_opt.eff_sacks) {
+			/* A SACK is 2 pad bytes, a 2 byte header, plus
+			 * 2 32-bit sequence numbers for each SACK block.
+			 */
+			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
+					    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+		}
+		
+		/*
+		 * If the connection is idle and we are restarting,
+		 * then we don't want to do any Vegas calculations
+		 * until we get fresh RTT samples.  So when we
+		 * restart, we reset our Vegas state to a clean
+		 * slate. After we get acks for this flight of
+		 * packets, _then_ we can make Vegas calculations
+		 * again.
+		 */
+		if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0)
+			tcp_vegas_enable(tp);
+
+		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+		skb->h.th = th;
+		skb_set_owner_w(skb, sk);
+
+		/* Build TCP header and checksum it. */
+		th->source		= inet->sport;
+		th->dest		= inet->dport;
+		th->seq			= htonl(tcb->seq);
+		th->ack_seq		= htonl(tp->rcv_nxt);
+		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
+		if (tcb->flags & TCPCB_FLAG_SYN) {
+			/* RFC1323: The window in SYN & SYN/ACK segments
+			 * is never scaled.
+			 */
+			th->window	= htons(tp->rcv_wnd);
+		} else {
+			th->window	= htons(tcp_select_window(sk));
+		}
+		th->check		= 0;
+		th->urg_ptr		= 0;
+
+		if (tp->urg_mode &&
+		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
+			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
+			th->urg			= 1;
+		}
+
+		if (tcb->flags & TCPCB_FLAG_SYN) {
+			tcp_syn_build_options((__u32 *)(th + 1),
+					      tcp_advertise_mss(sk),
+					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+					      (sysctl_flags & SYSCTL_FLAG_SACK),
+					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
+					      tp->rx_opt.rcv_wscale,
+					      tcb->when,
+		      			      tp->rx_opt.ts_recent);
+		} else {
+			tcp_build_and_update_options((__u32 *)(th + 1),
+						     tp, tcb->when);
+
+			TCP_ECN_send(sk, tp, skb, tcp_header_size);
+		}
+		tp->af_specific->send_check(sk, th, skb->len, skb);
+
+		if (tcb->flags & TCPCB_FLAG_ACK)
+			tcp_event_ack_sent(sk);
+
+		if (skb->len != tcp_header_size)
+			tcp_event_data_sent(tp, skb, sk);
+
+		TCP_INC_STATS(TCP_MIB_OUTSEGS);
+
+		err = tp->af_specific->queue_xmit(skb, 0);
+		if (err <= 0)
+			return err;
+
+		tcp_enter_cwr(tp);
+
+		/* NET_XMIT_CN is special. It does not guarantee,
+		 * that this packet is lost. It tells that device
+		 * is about to start to drop packets or already
+		 * drops some packets of the same priority and
+		 * invokes us to send less aggressively.
+		 */
+		return err == NET_XMIT_CN ? 0 : err;
+	}
+	return -ENOBUFS;
+#undef SYSCTL_FLAG_TSTAMPS
+#undef SYSCTL_FLAG_WSCALE
+#undef SYSCTL_FLAG_SACK
+}
+
+
+/* This routine just queue's the buffer 
+ *
+ * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
+ * otherwise socket can stall.
+ */
+static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	/* Advance write_seq and place onto the write_queue. */
+	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
+	skb_header_release(skb);
+	__skb_queue_tail(&sk->sk_write_queue, skb);
+	sk_charge_skb(sk, skb);
+
+	/* Queue it, remembering where we must start sending. */
+	if (sk->sk_send_head == NULL)
+		sk->sk_send_head = skb;
+}
+
+static inline void tcp_tso_set_push(struct sk_buff *skb)
+{
+	/* Force push to be on for any TSO frames to workaround
+	 * problems with busted implementations like Mac OS-X that
+	 * hold off socket receive wakeups until push is seen.
+	 */
+	if (tcp_skb_pcount(skb) > 1)
+		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+}
+
+/* Send _single_ skb sitting at the send head. This function requires
+ * true push pending frames to setup probe timer etc.
+ */
+void tcp_push_one(struct sock *sk, unsigned cur_mss)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = sk->sk_send_head;
+
+	if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) {
+		/* Send it out now. */
+		TCP_SKB_CB(skb)->when = tcp_time_stamp;
+		tcp_tso_set_push(skb);
+		if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) {
+			sk->sk_send_head = NULL;
+			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
+			tcp_packets_out_inc(sk, tp, skb);
+			return;
+		}
+	}
+}
+
+void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
+{
+	if (skb->len <= mss_std) {
+		/* Avoid the costly divide in the normal
+		 * non-TSO case.
+		 */
+		skb_shinfo(skb)->tso_segs = 1;
+		skb_shinfo(skb)->tso_size = 0;
+	} else {
+		unsigned int factor;
+
+		factor = skb->len + (mss_std - 1);
+		factor /= mss_std;
+		skb_shinfo(skb)->tso_segs = factor;
+		skb_shinfo(skb)->tso_size = mss_std;
+	}
+}
+
+/* Function to create two new TCP segments.  Shrinks the given segment
+ * to the specified size and appends a new segment with the rest of the
+ * packet to the list.  This won't be called frequently, I hope. 
+ * Remember, these are still headerless SKBs at this point.
+ */
+static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *buff;
+	int nsize;
+	u16 flags;
+
+	nsize = skb_headlen(skb) - len;
+	if (nsize < 0)
+		nsize = 0;
+
+	if (skb_cloned(skb) &&
+	    skb_is_nonlinear(skb) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/* Get a new skb... force flag on. */
+	buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
+	if (buff == NULL)
+		return -ENOMEM; /* We'll just try again later. */
+	sk_charge_skb(sk, buff);
+
+	/* Correct the sequence numbers. */
+	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
+	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
+
+	/* PSH and FIN should only be set in the second packet. */
+	flags = TCP_SKB_CB(skb)->flags;
+	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+	TCP_SKB_CB(buff)->flags = flags;
+	TCP_SKB_CB(buff)->sacked =
+		(TCP_SKB_CB(skb)->sacked &
+		 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
+	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
+
+	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
+		/* Copy and checksum data tail into the new buffer. */
+		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
+						       nsize, 0);
+
+		skb_trim(skb, len);
+
+		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
+	} else {
+		skb->ip_summed = CHECKSUM_HW;
+		skb_split(skb, buff, len);
+	}
+
+	buff->ip_summed = skb->ip_summed;
+
+	/* Looks stupid, but our code really uses when of
+	 * skbs, which it never sent before. --ANK
+	 */
+	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
+
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+		tp->lost_out -= tcp_skb_pcount(skb);
+		tp->left_out -= tcp_skb_pcount(skb);
+	}
+
+	/* Fix up tso_factor for both original and new SKB.  */
+	tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
+	tcp_set_skb_tso_segs(buff, tp->mss_cache_std);
+
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+		tp->lost_out += tcp_skb_pcount(skb);
+		tp->left_out += tcp_skb_pcount(skb);
+	}
+
+	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
+		tp->lost_out += tcp_skb_pcount(buff);
+		tp->left_out += tcp_skb_pcount(buff);
+	}
+
+	/* Link BUFF into the send queue. */
+	__skb_append(skb, buff);
+
+	return 0;
+}
+
+/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
+ * eventually). The difference is that pulled data not copied, but
+ * immediately discarded.
+ */
+static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
+{
+	int i, k, eat;
+
+	eat = len;
+	k = 0;
+	for (i=0; i<skb_shinfo(skb)->nr_frags; i++) {
+		if (skb_shinfo(skb)->frags[i].size <= eat) {
+			put_page(skb_shinfo(skb)->frags[i].page);
+			eat -= skb_shinfo(skb)->frags[i].size;
+		} else {
+			skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
+			if (eat) {
+				skb_shinfo(skb)->frags[k].page_offset += eat;
+				skb_shinfo(skb)->frags[k].size -= eat;
+				eat = 0;
+			}
+			k++;
+		}
+	}
+	skb_shinfo(skb)->nr_frags = k;
+
+	skb->tail = skb->data;
+	skb->data_len -= len;
+	skb->len = skb->data_len;
+	return skb->tail;
+}
+
+int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
+{
+	if (skb_cloned(skb) &&
+	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+		return -ENOMEM;
+
+	if (len <= skb_headlen(skb)) {
+		__skb_pull(skb, len);
+	} else {
+		if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
+			return -ENOMEM;
+	}
+
+	TCP_SKB_CB(skb)->seq += len;
+	skb->ip_summed = CHECKSUM_HW;
+
+	skb->truesize	     -= len;
+	sk->sk_wmem_queued   -= len;
+	sk->sk_forward_alloc += len;
+	sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
+
+	/* Any change of skb->len requires recalculation of tso
+	 * factor and mss.
+	 */
+	if (tcp_skb_pcount(skb) > 1)
+		tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb));
+
+	return 0;
+}
+
+/* This function synchronize snd mss to current pmtu/exthdr set.
+
+   tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
+   for TCP options, but includes only bare TCP header.
+
+   tp->rx_opt.mss_clamp is mss negotiated at connection setup.
+   It is minumum of user_mss and mss received with SYN.
+   It also does not include TCP options.
+
+   tp->pmtu_cookie is last pmtu, seen by this function.
+
+   tp->mss_cache is current effective sending mss, including
+   all tcp options except for SACKs. It is evaluated,
+   taking into account current pmtu, but never exceeds
+   tp->rx_opt.mss_clamp.
+
+   NOTE1. rfc1122 clearly states that advertised MSS
+   DOES NOT include either tcp or ip options.
+
+   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
+   this function.			--ANK (980731)
+ */
+
+unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int mss_now;
+
+	/* Calculate base mss without TCP options:
+	   It is MMS_S - sizeof(tcphdr) of rfc1122
+	 */
+	mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
+
+	/* Clamp it (mss_clamp does not include tcp options) */
+	if (mss_now > tp->rx_opt.mss_clamp)
+		mss_now = tp->rx_opt.mss_clamp;
+
+	/* Now subtract optional transport overhead */
+	mss_now -= tp->ext_header_len;
+
+	/* Then reserve room for full set of TCP options and 8 bytes of data */
+	if (mss_now < 48)
+		mss_now = 48;
+
+	/* Now subtract TCP options size, not including SACKs */
+	mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+	/* Bound mss with half of window */
+	if (tp->max_window && mss_now > (tp->max_window>>1))
+		mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
+
+	/* And store cached results */
+	tp->pmtu_cookie = pmtu;
+	tp->mss_cache = tp->mss_cache_std = mss_now;
+
+	return mss_now;
+}
+
+/* Compute the current effective MSS, taking SACKs and IP options,
+ * and even PMTU discovery events into account.
+ *
+ * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
+ * cannot be large. However, taking into account rare use of URG, this
+ * is not a big flaw.
+ */
+
+unsigned int tcp_current_mss(struct sock *sk, int large)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct dst_entry *dst = __sk_dst_get(sk);
+	unsigned int do_large, mss_now;
+
+	mss_now = tp->mss_cache_std;
+	if (dst) {
+		u32 mtu = dst_mtu(dst);
+		if (mtu != tp->pmtu_cookie)
+			mss_now = tcp_sync_mss(sk, mtu);
+	}
+
+	do_large = (large &&
+		    (sk->sk_route_caps & NETIF_F_TSO) &&
+		    !tp->urg_mode);
+
+	if (do_large) {
+		unsigned int large_mss, factor, limit;
+
+		large_mss = 65535 - tp->af_specific->net_header_len -
+			tp->ext_header_len - tp->tcp_header_len;
+
+		if (tp->max_window && large_mss > (tp->max_window>>1))
+			large_mss = max((tp->max_window>>1),
+					68U - tp->tcp_header_len);
+
+		factor = large_mss / mss_now;
+
+		/* Always keep large mss multiple of real mss, but
+		 * do not exceed 1/tso_win_divisor of the congestion window
+		 * so we can keep the ACK clock ticking and minimize
+		 * bursting.
+		 */
+		limit = tp->snd_cwnd;
+		if (sysctl_tcp_tso_win_divisor)
+			limit /= sysctl_tcp_tso_win_divisor;
+		limit = max(1U, limit);
+		if (factor > limit)
+			factor = limit;
+
+		tp->mss_cache = mss_now * factor;
+
+		mss_now = tp->mss_cache;
+	}
+
+	if (tp->rx_opt.eff_sacks)
+		mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
+			    (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+	return mss_now;
+}
+
+/* This routine writes packets to the network.  It advances the
+ * send_head.  This happens as incoming acks open up the remote
+ * window for us.
+ *
+ * Returns 1, if no segments are in flight and we have queued segments, but
+ * cannot send anything now because of SWS or another problem.
+ */
+int tcp_write_xmit(struct sock *sk, int nonagle)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int mss_now;
+
+	/* If we are closed, the bytes will have to remain here.
+	 * In time closedown will finish, we empty the write queue and all
+	 * will be happy.
+	 */
+	if (sk->sk_state != TCP_CLOSE) {
+		struct sk_buff *skb;
+		int sent_pkts = 0;
+
+		/* Account for SACKS, we may need to fragment due to this.
+		 * It is just like the real MSS changing on us midstream.
+		 * We also handle things correctly when the user adds some
+		 * IP options mid-stream.  Silly to do, but cover it.
+		 */
+		mss_now = tcp_current_mss(sk, 1);
+
+		while ((skb = sk->sk_send_head) &&
+		       tcp_snd_test(tp, skb, mss_now,
+			       	    tcp_skb_is_last(sk, skb) ? nonagle :
+				    			       TCP_NAGLE_PUSH)) {
+			if (skb->len > mss_now) {
+				if (tcp_fragment(sk, skb, mss_now))
+					break;
+			}
+
+			TCP_SKB_CB(skb)->when = tcp_time_stamp;
+			tcp_tso_set_push(skb);
+			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
+				break;
+
+			/* Advance the send_head.  This one is sent out.
+			 * This call will increment packets_out.
+			 */
+			update_send_head(sk, tp, skb);
+
+			tcp_minshall_update(tp, mss_now, skb);
+			sent_pkts = 1;
+		}
+
+		if (sent_pkts) {
+			tcp_cwnd_validate(sk, tp);
+			return 0;
+		}
+
+		return !tp->packets_out && sk->sk_send_head;
+	}
+	return 0;
+}
+
+/* This function returns the amount that we can raise the
+ * usable window based on the following constraints
+ *  
+ * 1. The window can never be shrunk once it is offered (RFC 793)
+ * 2. We limit memory per socket
+ *
+ * RFC 1122:
+ * "the suggested [SWS] avoidance algorithm for the receiver is to keep
+ *  RECV.NEXT + RCV.WIN fixed until:
+ *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
+ *
+ * i.e. don't raise the right edge of the window until you can raise
+ * it at least MSS bytes.
+ *
+ * Unfortunately, the recommended algorithm breaks header prediction,
+ * since header prediction assumes th->window stays fixed.
+ *
+ * Strictly speaking, keeping th->window fixed violates the receiver
+ * side SWS prevention criteria. The problem is that under this rule
+ * a stream of single byte packets will cause the right side of the
+ * window to always advance by a single byte.
+ * 
+ * Of course, if the sender implements sender side SWS prevention
+ * then this will not be a problem.
+ * 
+ * BSD seems to make the following compromise:
+ * 
+ *	If the free space is less than the 1/4 of the maximum
+ *	space available and the free space is less than 1/2 mss,
+ *	then set the window to 0.
+ *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
+ *	Otherwise, just prevent the window from shrinking
+ *	and from being larger than the largest representable value.
+ *
+ * This prevents incremental opening of the window in the regime
+ * where TCP is limited by the speed of the reader side taking
+ * data out of the TCP receive queue. It does nothing about
+ * those cases where the window is constrained on the sender side
+ * because the pipeline is full.
+ *
+ * BSD also seems to "accidentally" limit itself to windows that are a
+ * multiple of MSS, at least until the free space gets quite small.
+ * This would appear to be a side effect of the mbuf implementation.
+ * Combining these two algorithms results in the observed behavior
+ * of having a fixed window size at almost all times.
+ *
+ * Below we obtain similar behavior by forcing the offered window to
+ * a multiple of the mss when it is feasible to do so.
+ *
+ * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
+ * Regular options like TIMESTAMP are taken into account.
+ */
+u32 __tcp_select_window(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	/* MSS for the peer's data.  Previous verions used mss_clamp
+	 * here.  I don't know if the value based on our guesses
+	 * of peer's MSS is better for the performance.  It's more correct
+	 * but may be worse for the performance because of rcv_mss
+	 * fluctuations.  --SAW  1998/11/1
+	 */
+	int mss = tp->ack.rcv_mss;
+	int free_space = tcp_space(sk);
+	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
+	int window;
+
+	if (mss > full_space)
+		mss = full_space; 
+
+	if (free_space < full_space/2) {
+		tp->ack.quick = 0;
+
+		if (tcp_memory_pressure)
+			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
+
+		if (free_space < mss)
+			return 0;
+	}
+
+	if (free_space > tp->rcv_ssthresh)
+		free_space = tp->rcv_ssthresh;
+
+	/* Don't do rounding if we are using window scaling, since the
+	 * scaled window will not line up with the MSS boundary anyway.
+	 */
+	window = tp->rcv_wnd;
+	if (tp->rx_opt.rcv_wscale) {
+		window = free_space;
+
+		/* Advertise enough space so that it won't get scaled away.
+		 * Import case: prevent zero window announcement if
+		 * 1<<rcv_wscale > mss.
+		 */
+		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
+			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
+				  << tp->rx_opt.rcv_wscale);
+	} else {
+		/* Get the largest window that is a nice multiple of mss.
+		 * Window clamp already applied above.
+		 * If our current window offering is within 1 mss of the
+		 * free space we just keep it. This prevents the divide
+		 * and multiply from happening most of the time.
+		 * We also don't do any window rounding when the free space
+		 * is too small.
+		 */
+		if (window <= free_space - mss || window > free_space)
+			window = (free_space/mss)*mss;
+	}
+
+	return window;
+}
+
+/* Attempt to collapse two adjacent SKB's during retransmission. */
+static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *next_skb = skb->next;
+
+	/* The first test we must make is that neither of these two
+	 * SKB's are still referenced by someone else.
+	 */
+	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
+		int skb_size = skb->len, next_skb_size = next_skb->len;
+		u16 flags = TCP_SKB_CB(skb)->flags;
+
+		/* Also punt if next skb has been SACK'd. */
+		if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
+			return;
+
+		/* Next skb is out of window. */
+		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
+			return;
+
+		/* Punt if not enough space exists in the first SKB for
+		 * the data in the second, or the total combined payload
+		 * would exceed the MSS.
+		 */
+		if ((next_skb_size > skb_tailroom(skb)) ||
+		    ((skb_size + next_skb_size) > mss_now))
+			return;
+
+		BUG_ON(tcp_skb_pcount(skb) != 1 ||
+		       tcp_skb_pcount(next_skb) != 1);
+
+		/* Ok.  We will be able to collapse the packet. */
+		__skb_unlink(next_skb, next_skb->list);
+
+		memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
+
+		if (next_skb->ip_summed == CHECKSUM_HW)
+			skb->ip_summed = CHECKSUM_HW;
+
+		if (skb->ip_summed != CHECKSUM_HW)
+			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
+
+		/* Update sequence range on original skb. */
+		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
+
+		/* Merge over control information. */
+		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
+		TCP_SKB_CB(skb)->flags = flags;
+
+		/* All done, get rid of second SKB and account for it so
+		 * packet counting does not break.
+		 */
+		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
+		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
+			tp->retrans_out -= tcp_skb_pcount(next_skb);
+		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
+			tp->lost_out -= tcp_skb_pcount(next_skb);
+			tp->left_out -= tcp_skb_pcount(next_skb);
+		}
+		/* Reno case is special. Sigh... */
+		if (!tp->rx_opt.sack_ok && tp->sacked_out) {
+			tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
+			tp->left_out -= tcp_skb_pcount(next_skb);
+		}
+
+		/* Not quite right: it can be > snd.fack, but
+		 * it is better to underestimate fackets.
+		 */
+		tcp_dec_pcount_approx(&tp->fackets_out, next_skb);
+		tcp_packets_out_dec(tp, next_skb);
+		sk_stream_free_skb(sk, next_skb);
+	}
+}
+
+/* Do a simple retransmit without using the backoff mechanisms in
+ * tcp_timer. This is used for path mtu discovery. 
+ * The socket is already locked here.
+ */ 
+void tcp_simple_retransmit(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	unsigned int mss = tcp_current_mss(sk, 0);
+	int lost = 0;
+
+	sk_stream_for_retrans_queue(skb, sk) {
+		if (skb->len > mss && 
+		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
+			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
+				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+				tp->retrans_out -= tcp_skb_pcount(skb);
+			}
+			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
+				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+				tp->lost_out += tcp_skb_pcount(skb);
+				lost = 1;
+			}
+		}
+	}
+
+	if (!lost)
+		return;
+
+	tcp_sync_left_out(tp);
+
+ 	/* Don't muck with the congestion window here.
+	 * Reason is that we do not increase amount of _data_
+	 * in network, but units changed and effective
+	 * cwnd/ssthresh really reduced now.
+	 */
+	if (tp->ca_state != TCP_CA_Loss) {
+		tp->high_seq = tp->snd_nxt;
+		tp->snd_ssthresh = tcp_current_ssthresh(tp);
+		tp->prior_ssthresh = 0;
+		tp->undo_marker = 0;
+		tcp_set_ca_state(tp, TCP_CA_Loss);
+	}
+	tcp_xmit_retransmit_queue(sk);
+}
+
+/* This retransmits one SKB.  Policy decisions and retransmit queue
+ * state updates are done by the caller.  Returns non-zero if an
+ * error occurred which prevented the send.
+ */
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+ 	unsigned int cur_mss = tcp_current_mss(sk, 0);
+	int err;
+
+	/* Do not sent more than we queued. 1/4 is reserved for possible
+	 * copying overhead: frgagmentation, tunneling, mangling etc.
+	 */
+	if (atomic_read(&sk->sk_wmem_alloc) >
+	    min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
+		return -EAGAIN;
+
+	if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
+		if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
+			BUG();
+
+		if (sk->sk_route_caps & NETIF_F_TSO) {
+			sk->sk_route_caps &= ~NETIF_F_TSO;
+			sock_set_flag(sk, SOCK_NO_LARGESEND);
+			tp->mss_cache = tp->mss_cache_std;
+		}
+
+		if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
+			return -ENOMEM;
+	}
+
+	/* If receiver has shrunk his window, and skb is out of
+	 * new window, do not retransmit it. The exception is the
+	 * case, when window is shrunk to zero. In this case
+	 * our retransmit serves as a zero window probe.
+	 */
+	if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)
+	    && TCP_SKB_CB(skb)->seq != tp->snd_una)
+		return -EAGAIN;
+
+	if (skb->len > cur_mss) {
+		int old_factor = tcp_skb_pcount(skb);
+		int new_factor;
+
+		if (tcp_fragment(sk, skb, cur_mss))
+			return -ENOMEM; /* We'll try again later. */
+
+		/* New SKB created, account for it. */
+		new_factor = tcp_skb_pcount(skb);
+		tp->packets_out -= old_factor - new_factor;
+		tp->packets_out += tcp_skb_pcount(skb->next);
+	}
+
+	/* Collapse two adjacent packets if worthwhile and we can. */
+	if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
+	   (skb->len < (cur_mss >> 1)) &&
+	   (skb->next != sk->sk_send_head) &&
+	   (skb->next != (struct sk_buff *)&sk->sk_write_queue) &&
+	   (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) &&
+	   (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) &&
+	   (sysctl_tcp_retrans_collapse != 0))
+		tcp_retrans_try_collapse(sk, skb, cur_mss);
+
+	if(tp->af_specific->rebuild_header(sk))
+		return -EHOSTUNREACH; /* Routing failure or similar. */
+
+	/* Some Solaris stacks overoptimize and ignore the FIN on a
+	 * retransmit when old data is attached.  So strip it off
+	 * since it is cheap to do so and saves bytes on the network.
+	 */
+	if(skb->len > 0 &&
+	   (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+	   tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
+		if (!pskb_trim(skb, 0)) {
+			TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
+			skb_shinfo(skb)->tso_segs = 1;
+			skb_shinfo(skb)->tso_size = 0;
+			skb->ip_summed = CHECKSUM_NONE;
+			skb->csum = 0;
+		}
+	}
+
+	/* Make a copy, if the first transmission SKB clone we made
+	 * is still in somebody's hands, else make a clone.
+	 */
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	tcp_tso_set_push(skb);
+
+	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
+				    pskb_copy(skb, GFP_ATOMIC):
+				    skb_clone(skb, GFP_ATOMIC)));
+
+	if (err == 0) {
+		/* Update global TCP statistics. */
+		TCP_INC_STATS(TCP_MIB_RETRANSSEGS);
+
+		tp->total_retrans++;
+
+#if FASTRETRANS_DEBUG > 0
+		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
+			if (net_ratelimit())
+				printk(KERN_DEBUG "retrans_out leaked.\n");
+		}
+#endif
+		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
+		tp->retrans_out += tcp_skb_pcount(skb);
+
+		/* Save stamp of the first retransmit. */
+		if (!tp->retrans_stamp)
+			tp->retrans_stamp = TCP_SKB_CB(skb)->when;
+
+		tp->undo_retrans++;
+
+		/* snd_nxt is stored to detect loss of retransmitted segment,
+		 * see tcp_input.c tcp_sacktag_write_queue().
+		 */
+		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
+	}
+	return err;
+}
+
+/* This gets called after a retransmit timeout, and the initially
+ * retransmitted data is acknowledged.  It tries to continue
+ * resending the rest of the retransmit queue, until either
+ * we've sent it all or the congestion window limit is reached.
+ * If doing SACK, the first ACK which comes back for a timeout
+ * based retransmit packet might feed us FACK information again.
+ * If so, we use it to avoid unnecessarily retransmissions.
+ */
+void tcp_xmit_retransmit_queue(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+	int packet_cnt = tp->lost_out;
+
+	/* First pass: retransmit lost packets. */
+	if (packet_cnt) {
+		sk_stream_for_retrans_queue(skb, sk) {
+			__u8 sacked = TCP_SKB_CB(skb)->sacked;
+
+			/* Assume this retransmit will generate
+			 * only one packet for congestion window
+			 * calculation purposes.  This works because
+			 * tcp_retransmit_skb() will chop up the
+			 * packet to be MSS sized and all the
+			 * packet counting works out.
+			 */
+			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+				return;
+
+			if (sacked&TCPCB_LOST) {
+				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
+					if (tcp_retransmit_skb(sk, skb))
+						return;
+					if (tp->ca_state != TCP_CA_Loss)
+						NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
+					else
+						NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
+
+					if (skb ==
+					    skb_peek(&sk->sk_write_queue))
+						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+				}
+
+				packet_cnt -= tcp_skb_pcount(skb);
+				if (packet_cnt <= 0)
+					break;
+			}
+		}
+	}
+
+	/* OK, demanded retransmission is finished. */
+
+	/* Forward retransmissions are possible only during Recovery. */
+	if (tp->ca_state != TCP_CA_Recovery)
+		return;
+
+	/* No forward retransmissions in Reno are possible. */
+	if (!tp->rx_opt.sack_ok)
+		return;
+
+	/* Yeah, we have to make difficult choice between forward transmission
+	 * and retransmission... Both ways have their merits...
+	 *
+	 * For now we do not retransmit anything, while we have some new
+	 * segments to send.
+	 */
+
+	if (tcp_may_send_now(sk, tp))
+		return;
+
+	packet_cnt = 0;
+
+	sk_stream_for_retrans_queue(skb, sk) {
+		/* Similar to the retransmit loop above we
+		 * can pretend that the retransmitted SKB
+		 * we send out here will be composed of one
+		 * real MSS sized packet because tcp_retransmit_skb()
+		 * will fragment it if necessary.
+		 */
+		if (++packet_cnt > tp->fackets_out)
+			break;
+
+		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+			break;
+
+		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
+			continue;
+
+		/* Ok, retransmit it. */
+		if (tcp_retransmit_skb(sk, skb))
+			break;
+
+		if (skb == skb_peek(&sk->sk_write_queue))
+			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+
+		NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
+	}
+}
+
+
+/* Send a fin.  The caller locks the socket for us.  This cannot be
+ * allowed to fail queueing a FIN frame under any circumstances.
+ */
+void tcp_send_fin(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);	
+	struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
+	int mss_now;
+	
+	/* Optimization, tack on the FIN if we have a queue of
+	 * unsent frames.  But be careful about outgoing SACKS
+	 * and IP options.
+	 */
+	mss_now = tcp_current_mss(sk, 1);
+
+	if (sk->sk_send_head != NULL) {
+		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
+		TCP_SKB_CB(skb)->end_seq++;
+		tp->write_seq++;
+	} else {
+		/* Socket is locked, keep trying until memory is available. */
+		for (;;) {
+			skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+			if (skb)
+				break;
+			yield();
+		}
+
+		/* Reserve space for headers and prepare control bits. */
+		skb_reserve(skb, MAX_TCP_HEADER);
+		skb->csum = 0;
+		TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
+		TCP_SKB_CB(skb)->sacked = 0;
+		skb_shinfo(skb)->tso_segs = 1;
+		skb_shinfo(skb)->tso_size = 0;
+
+		/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
+		TCP_SKB_CB(skb)->seq = tp->write_seq;
+		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+		tcp_queue_skb(sk, skb);
+	}
+	__tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF);
+}
+
+/* We get here when a process closes a file descriptor (either due to
+ * an explicit close() or as a byproduct of exit()'ing) and there
+ * was unread data in the receive queue.  This behavior is recommended
+ * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
+ */
+void tcp_send_active_reset(struct sock *sk, int priority)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	/* NOTE: No TCP options attached and we never retransmit this. */
+	skb = alloc_skb(MAX_TCP_HEADER, priority);
+	if (!skb) {
+		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+		return;
+	}
+
+	/* Reserve space for headers and prepare control bits. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+	skb->csum = 0;
+	TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
+	TCP_SKB_CB(skb)->sacked = 0;
+	skb_shinfo(skb)->tso_segs = 1;
+	skb_shinfo(skb)->tso_size = 0;
+
+	/* Send it off. */
+	TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	if (tcp_transmit_skb(sk, skb))
+		NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
+}
+
+/* WARNING: This routine must only be called when we have already sent
+ * a SYN packet that crossed the incoming SYN that caused this routine
+ * to get called. If this assumption fails then the initial rcv_wnd
+ * and rcv_wscale values will not be correct.
+ */
+int tcp_send_synack(struct sock *sk)
+{
+	struct sk_buff* skb;
+
+	skb = skb_peek(&sk->sk_write_queue);
+	if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) {
+		printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n");
+		return -EFAULT;
+	}
+	if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) {
+		if (skb_cloned(skb)) {
+			struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
+			if (nskb == NULL)
+				return -ENOMEM;
+			__skb_unlink(skb, &sk->sk_write_queue);
+			skb_header_release(nskb);
+			__skb_queue_head(&sk->sk_write_queue, nskb);
+			sk_stream_free_skb(sk, skb);
+			sk_charge_skb(sk, nskb);
+			skb = nskb;
+		}
+
+		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK;
+		TCP_ECN_send_synack(tcp_sk(sk), skb);
+	}
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+}
+
+/*
+ * Prepare a SYN-ACK.
+ */
+struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
+				 struct open_request *req)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th;
+	int tcp_header_size;
+	struct sk_buff *skb;
+
+	skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	/* Reserve space for headers. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+
+	skb->dst = dst_clone(dst);
+
+	tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS +
+			   (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) +
+			   (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
+			   /* SACK_PERM is in the place of NOP NOP of TS */
+			   ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
+	skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+
+	memset(th, 0, sizeof(struct tcphdr));
+	th->syn = 1;
+	th->ack = 1;
+	if (dst->dev->features&NETIF_F_TSO)
+		req->ecn_ok = 0;
+	TCP_ECN_make_synack(req, th);
+	th->source = inet_sk(sk)->sport;
+	th->dest = req->rmt_port;
+	TCP_SKB_CB(skb)->seq = req->snt_isn;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
+	TCP_SKB_CB(skb)->sacked = 0;
+	skb_shinfo(skb)->tso_segs = 1;
+	skb_shinfo(skb)->tso_size = 0;
+	th->seq = htonl(TCP_SKB_CB(skb)->seq);
+	th->ack_seq = htonl(req->rcv_isn + 1);
+	if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
+		__u8 rcv_wscale; 
+		/* Set this up on the first call only */
+		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+		/* tcp_full_space because it is guaranteed to be the first packet */
+		tcp_select_initial_window(tcp_full_space(sk), 
+			dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+			&req->rcv_wnd,
+			&req->window_clamp,
+			req->wscale_ok,
+			&rcv_wscale);
+		req->rcv_wscale = rcv_wscale; 
+	}
+
+	/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
+	th->window = htons(req->rcv_wnd);
+
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok,
+			      req->sack_ok, req->wscale_ok, req->rcv_wscale,
+			      TCP_SKB_CB(skb)->when,
+			      req->ts_recent);
+
+	skb->csum = 0;
+	th->doff = (tcp_header_size >> 2);
+	TCP_INC_STATS(TCP_MIB_OUTSEGS);
+	return skb;
+}
+
+/* 
+ * Do all connect socket setups that can be done AF independent.
+ */ 
+static inline void tcp_connect_init(struct sock *sk)
+{
+	struct dst_entry *dst = __sk_dst_get(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u8 rcv_wscale;
+
+	/* We'll fix this up when we get a response from the other end.
+	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
+	 */
+	tp->tcp_header_len = sizeof(struct tcphdr) +
+		(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
+	/* If user gave his TCP_MAXSEG, record it to clamp */
+	if (tp->rx_opt.user_mss)
+		tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+	tp->max_window = 0;
+	tcp_sync_mss(sk, dst_mtu(dst));
+
+	if (!tp->window_clamp)
+		tp->window_clamp = dst_metric(dst, RTAX_WINDOW);
+	tp->advmss = dst_metric(dst, RTAX_ADVMSS);
+	tcp_initialize_rcv_mss(sk);
+	tcp_ca_init(tp);
+
+	tcp_select_initial_window(tcp_full_space(sk),
+				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+				  &tp->rcv_wnd,
+				  &tp->window_clamp,
+				  sysctl_tcp_window_scaling,
+				  &rcv_wscale);
+
+	tp->rx_opt.rcv_wscale = rcv_wscale;
+	tp->rcv_ssthresh = tp->rcv_wnd;
+
+	sk->sk_err = 0;
+	sock_reset_flag(sk, SOCK_DONE);
+	tp->snd_wnd = 0;
+	tcp_init_wl(tp, tp->write_seq, 0);
+	tp->snd_una = tp->write_seq;
+	tp->snd_sml = tp->write_seq;
+	tp->rcv_nxt = 0;
+	tp->rcv_wup = 0;
+	tp->copied_seq = 0;
+
+	tp->rto = TCP_TIMEOUT_INIT;
+	tp->retransmits = 0;
+	tcp_clear_retrans(tp);
+}
+
+/*
+ * Build a SYN and send it off.
+ */ 
+int tcp_connect(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *buff;
+
+	tcp_connect_init(sk);
+
+	buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+	if (unlikely(buff == NULL))
+		return -ENOBUFS;
+
+	/* Reserve space for headers. */
+	skb_reserve(buff, MAX_TCP_HEADER);
+
+	TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
+	TCP_ECN_send_syn(sk, tp, buff);
+	TCP_SKB_CB(buff)->sacked = 0;
+	skb_shinfo(buff)->tso_segs = 1;
+	skb_shinfo(buff)->tso_size = 0;
+	buff->csum = 0;
+	TCP_SKB_CB(buff)->seq = tp->write_seq++;
+	TCP_SKB_CB(buff)->end_seq = tp->write_seq;
+	tp->snd_nxt = tp->write_seq;
+	tp->pushed_seq = tp->write_seq;
+	tcp_ca_init(tp);
+
+	/* Send it off. */
+	TCP_SKB_CB(buff)->when = tcp_time_stamp;
+	tp->retrans_stamp = TCP_SKB_CB(buff)->when;
+	skb_header_release(buff);
+	__skb_queue_tail(&sk->sk_write_queue, buff);
+	sk_charge_skb(sk, buff);
+	tp->packets_out += tcp_skb_pcount(buff);
+	tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+	TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
+
+	/* Timer for repeating the SYN until an answer. */
+	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+	return 0;
+}
+
+/* Send out a delayed ack, the caller does the policy checking
+ * to see if we should even be here.  See tcp_input.c:tcp_ack_snd_check()
+ * for details.
+ */
+void tcp_send_delayed_ack(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int ato = tp->ack.ato;
+	unsigned long timeout;
+
+	if (ato > TCP_DELACK_MIN) {
+		int max_ato = HZ/2;
+
+		if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+			max_ato = TCP_DELACK_MAX;
+
+		/* Slow path, intersegment interval is "high". */
+
+		/* If some rtt estimate is known, use it to bound delayed ack.
+		 * Do not use tp->rto here, use results of rtt measurements
+		 * directly.
+		 */
+		if (tp->srtt) {
+			int rtt = max(tp->srtt>>3, TCP_DELACK_MIN);
+
+			if (rtt < max_ato)
+				max_ato = rtt;
+		}
+
+		ato = min(ato, max_ato);
+	}
+
+	/* Stay within the limit we were given */
+	timeout = jiffies + ato;
+
+	/* Use new timeout only if there wasn't a older one earlier. */
+	if (tp->ack.pending&TCP_ACK_TIMER) {
+		/* If delack timer was blocked or is about to expire,
+		 * send ACK now.
+		 */
+		if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+			tcp_send_ack(sk);
+			return;
+		}
+
+		if (!time_before(timeout, tp->ack.timeout))
+			timeout = tp->ack.timeout;
+	}
+	tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
+	tp->ack.timeout = timeout;
+	sk_reset_timer(sk, &tp->delack_timer, timeout);
+}
+
+/* This routine sends an ack and also updates the window. */
+void tcp_send_ack(struct sock *sk)
+{
+	/* If we have been reset, we may not send again. */
+	if (sk->sk_state != TCP_CLOSE) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		struct sk_buff *buff;
+
+		/* We are not putting this on the write queue, so
+		 * tcp_transmit_skb() will set the ownership to this
+		 * sock.
+		 */
+		buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+		if (buff == NULL) {
+			tcp_schedule_ack(tp);
+			tp->ack.ato = TCP_ATO_MIN;
+			tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+			return;
+		}
+
+		/* Reserve space for headers and prepare control bits. */
+		skb_reserve(buff, MAX_TCP_HEADER);
+		buff->csum = 0;
+		TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
+		TCP_SKB_CB(buff)->sacked = 0;
+		skb_shinfo(buff)->tso_segs = 1;
+		skb_shinfo(buff)->tso_size = 0;
+
+		/* Send it off, this clears delayed acks for us. */
+		TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
+		TCP_SKB_CB(buff)->when = tcp_time_stamp;
+		tcp_transmit_skb(sk, buff);
+	}
+}
+
+/* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
+ *
+ * Question: what should we make while urgent mode?
+ * 4.4BSD forces sending single byte of data. We cannot send
+ * out of window data, because we have SND.NXT==SND.MAX...
+ *
+ * Current solution: to send TWO zero-length segments in urgent mode:
+ * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is
+ * out-of-date with SND.UNA-1 to probe window.
+ */
+static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb;
+
+	/* We don't queue it, tcp_transmit_skb() sets ownership. */
+	skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
+	if (skb == NULL) 
+		return -1;
+
+	/* Reserve space for headers and set control bits. */
+	skb_reserve(skb, MAX_TCP_HEADER);
+	skb->csum = 0;
+	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
+	TCP_SKB_CB(skb)->sacked = urgent;
+	skb_shinfo(skb)->tso_segs = 1;
+	skb_shinfo(skb)->tso_size = 0;
+
+	/* Use a previous sequence.  This should cause the other
+	 * end to send an ack.  Don't queue or clone SKB, just
+	 * send it.
+	 */
+	TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
+	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
+	TCP_SKB_CB(skb)->when = tcp_time_stamp;
+	return tcp_transmit_skb(sk, skb);
+}
+
+int tcp_write_wakeup(struct sock *sk)
+{
+	if (sk->sk_state != TCP_CLOSE) {
+		struct tcp_sock *tp = tcp_sk(sk);
+		struct sk_buff *skb;
+
+		if ((skb = sk->sk_send_head) != NULL &&
+		    before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) {
+			int err;
+			unsigned int mss = tcp_current_mss(sk, 0);
+			unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq;
+
+			if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
+				tp->pushed_seq = TCP_SKB_CB(skb)->end_seq;
+
+			/* We are probing the opening of a window
+			 * but the window size is != 0
+			 * must have been a result SWS avoidance ( sender )
+			 */
+			if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq ||
+			    skb->len > mss) {
+				seg_size = min(seg_size, mss);
+				TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+				if (tcp_fragment(sk, skb, seg_size))
+					return -1;
+				/* SWS override triggered forced fragmentation.
+				 * Disable TSO, the connection is too sick. */
+				if (sk->sk_route_caps & NETIF_F_TSO) {
+					sock_set_flag(sk, SOCK_NO_LARGESEND);
+					sk->sk_route_caps &= ~NETIF_F_TSO;
+					tp->mss_cache = tp->mss_cache_std;
+				}
+			} else if (!tcp_skb_pcount(skb))
+				tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
+
+			TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
+			TCP_SKB_CB(skb)->when = tcp_time_stamp;
+			tcp_tso_set_push(skb);
+			err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+			if (!err) {
+				update_send_head(sk, tp, skb);
+			}
+			return err;
+		} else {
+			if (tp->urg_mode &&
+			    between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF))
+				tcp_xmit_probe_skb(sk, TCPCB_URG);
+			return tcp_xmit_probe_skb(sk, 0);
+		}
+	}
+	return -1;
+}
+
+/* A window probe timeout has occurred.  If window is not closed send
+ * a partial packet else a zero probe.
+ */
+void tcp_send_probe0(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int err;
+
+	err = tcp_write_wakeup(sk);
+
+	if (tp->packets_out || !sk->sk_send_head) {
+		/* Cancel probe timer, if it is not required. */
+		tp->probes_out = 0;
+		tp->backoff = 0;
+		return;
+	}
+
+	if (err <= 0) {
+		if (tp->backoff < sysctl_tcp_retries2)
+			tp->backoff++;
+		tp->probes_out++;
+		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
+				      min(tp->rto << tp->backoff, TCP_RTO_MAX));
+	} else {
+		/* If packet was not sent due to local congestion,
+		 * do not backoff and do not remember probes_out.
+		 * Let local senders to fight for local resources.
+		 *
+		 * Use accumulated backoff yet.
+		 */
+		if (!tp->probes_out)
+			tp->probes_out=1;
+		tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
+				      min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+	}
+}
+
+EXPORT_SYMBOL(tcp_connect);
+EXPORT_SYMBOL(tcp_make_synack);
+EXPORT_SYMBOL(tcp_simple_retransmit);
+EXPORT_SYMBOL(tcp_sync_mss);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
new file mode 100644
index 000000000000..85b279f1e935
--- /dev/null
+++ b/net/ipv4/tcp_timer.c
@@ -0,0 +1,656 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Implementation of the Transmission Control Protocol(TCP).
+ *
+ * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Mark Evans, <evansmp@uhura.aston.ac.uk>
+ *		Corey Minyard <wf-rch!minyard@relay.EU.net>
+ *		Florian La Roche, <flla@stud.uni-sb.de>
+ *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
+ *		Linus Torvalds, <torvalds@cs.helsinki.fi>
+ *		Alan Cox, <gw4pts@gw4pts.ampr.org>
+ *		Matthew Dillon, <dillon@apollo.west.oic.com>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Jorge Cwik, <jorge@laser.satlink.net>
+ */
+
+#include <linux/module.h>
+#include <net/tcp.h>
+
+int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; 
+int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; 
+int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
+int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
+int sysctl_tcp_retries1 = TCP_RETR1;
+int sysctl_tcp_retries2 = TCP_RETR2;
+int sysctl_tcp_orphan_retries;
+
+static void tcp_write_timer(unsigned long);
+static void tcp_delack_timer(unsigned long);
+static void tcp_keepalive_timer (unsigned long data);
+
+#ifdef TCP_DEBUG
+const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n";
+EXPORT_SYMBOL(tcp_timer_bug_msg);
+#endif
+
+/*
+ * Using different timers for retransmit, delayed acks and probes
+ * We may wish use just one timer maintaining a list of expire jiffies 
+ * to optimize.
+ */
+
+void tcp_init_xmit_timers(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	init_timer(&tp->retransmit_timer);
+	tp->retransmit_timer.function=&tcp_write_timer;
+	tp->retransmit_timer.data = (unsigned long) sk;
+	tp->pending = 0;
+
+	init_timer(&tp->delack_timer);
+	tp->delack_timer.function=&tcp_delack_timer;
+	tp->delack_timer.data = (unsigned long) sk;
+	tp->ack.pending = 0;
+
+	init_timer(&sk->sk_timer);
+	sk->sk_timer.function	= &tcp_keepalive_timer;
+	sk->sk_timer.data	= (unsigned long)sk;
+}
+
+void tcp_clear_xmit_timers(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	tp->pending = 0;
+	sk_stop_timer(sk, &tp->retransmit_timer);
+
+	tp->ack.pending = 0;
+	tp->ack.blocked = 0;
+	sk_stop_timer(sk, &tp->delack_timer);
+
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+
+static void tcp_write_err(struct sock *sk)
+{
+	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
+	sk->sk_error_report(sk);
+
+	tcp_done(sk);
+	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
+}
+
+/* Do not allow orphaned sockets to eat all our resources.
+ * This is direct violation of TCP specs, but it is required
+ * to prevent DoS attacks. It is called when a retransmission timeout
+ * or zero probe timeout occurs on orphaned socket.
+ *
+ * Criterium is still not confirmed experimentally and may change.
+ * We kill the socket, if:
+ * 1. If number of orphaned sockets exceeds an administratively configured
+ *    limit.
+ * 2. If we have strong memory pressure.
+ */
+static int tcp_out_of_resources(struct sock *sk, int do_reset)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int orphans = atomic_read(&tcp_orphan_count);
+
+	/* If peer does not open window for long time, or did not transmit 
+	 * anything for long time, penalize it. */
+	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
+		orphans <<= 1;
+
+	/* If some dubious ICMP arrived, penalize even more. */
+	if (sk->sk_err_soft)
+		orphans <<= 1;
+
+	if (orphans >= sysctl_tcp_max_orphans ||
+	    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
+	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
+		if (net_ratelimit())
+			printk(KERN_INFO "Out of socket memory\n");
+
+		/* Catch exceptional cases, when connection requires reset.
+		 *      1. Last segment was sent recently. */
+		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
+		    /*  2. Window is closed. */
+		    (!tp->snd_wnd && !tp->packets_out))
+			do_reset = 1;
+		if (do_reset)
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+		tcp_done(sk);
+		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
+		return 1;
+	}
+	return 0;
+}
+
+/* Calculate maximal number or retries on an orphaned socket. */
+static int tcp_orphan_retries(struct sock *sk, int alive)
+{
+	int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+
+	/* We know from an ICMP that something is wrong. */
+	if (sk->sk_err_soft && !alive)
+		retries = 0;
+
+	/* However, if socket sent something recently, select some safe
+	 * number of retries. 8 corresponds to >100 seconds with minimal
+	 * RTO of 200msec. */
+	if (retries == 0 && alive)
+		retries = 8;
+	return retries;
+}
+
+/* A write timeout has occurred. Process the after effects. */
+static int tcp_write_timeout(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int retry_until;
+
+	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
+		if (tp->retransmits)
+			dst_negative_advice(&sk->sk_dst_cache);
+		retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries;
+	} else {
+		if (tp->retransmits >= sysctl_tcp_retries1) {
+			/* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black
+			   hole detection. :-(
+
+			   It is place to make it. It is not made. I do not want
+			   to make it. It is disguisting. It does not work in any
+			   case. Let me to cite the same draft, which requires for
+			   us to implement this:
+
+   "The one security concern raised by this memo is that ICMP black holes
+   are often caused by over-zealous security administrators who block
+   all ICMP messages.  It is vitally important that those who design and
+   deploy security systems understand the impact of strict filtering on
+   upper-layer protocols.  The safest web site in the world is worthless
+   if most TCP implementations cannot transfer data from it.  It would
+   be far nicer to have all of the black holes fixed rather than fixing
+   all of the TCP implementations."
+
+                           Golden words :-).
+		   */
+
+			dst_negative_advice(&sk->sk_dst_cache);
+		}
+
+		retry_until = sysctl_tcp_retries2;
+		if (sock_flag(sk, SOCK_DEAD)) {
+			int alive = (tp->rto < TCP_RTO_MAX);
+ 
+			retry_until = tcp_orphan_retries(sk, alive);
+
+			if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until))
+				return 1;
+		}
+	}
+
+	if (tp->retransmits >= retry_until) {
+		/* Has it gone just too far? */
+		tcp_write_err(sk);
+		return 1;
+	}
+	return 0;
+}
+
+static void tcp_delack_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock*)data;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */
+		tp->ack.blocked = 1;
+		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
+		sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN);
+		goto out_unlock;
+	}
+
+	sk_stream_mem_reclaim(sk);
+
+	if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER))
+		goto out;
+
+	if (time_after(tp->ack.timeout, jiffies)) {
+		sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout);
+		goto out;
+	}
+	tp->ack.pending &= ~TCP_ACK_TIMER;
+
+	if (skb_queue_len(&tp->ucopy.prequeue)) {
+		struct sk_buff *skb;
+
+		NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, 
+				 skb_queue_len(&tp->ucopy.prequeue));
+
+		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
+			sk->sk_backlog_rcv(sk, skb);
+
+		tp->ucopy.memory = 0;
+	}
+
+	if (tcp_ack_scheduled(tp)) {
+		if (!tp->ack.pingpong) {
+			/* Delayed ACK missed: inflate ATO. */
+			tp->ack.ato = min(tp->ack.ato << 1, tp->rto);
+		} else {
+			/* Delayed ACK missed: leave pingpong mode and
+			 * deflate ATO.
+			 */
+			tp->ack.pingpong = 0;
+			tp->ack.ato = TCP_ATO_MIN;
+		}
+		tcp_send_ack(sk);
+		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
+	}
+	TCP_CHECK_TIMER(sk);
+
+out:
+	if (tcp_memory_pressure)
+		sk_stream_mem_reclaim(sk);
+out_unlock:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+static void tcp_probe_timer(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int max_probes;
+
+	if (tp->packets_out || !sk->sk_send_head) {
+		tp->probes_out = 0;
+		return;
+	}
+
+	/* *WARNING* RFC 1122 forbids this
+	 *
+	 * It doesn't AFAIK, because we kill the retransmit timer -AK
+	 *
+	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
+	 * this behaviour in Solaris down as a bug fix. [AC]
+	 *
+	 * Let me to explain. probes_out is zeroed by incoming ACKs
+	 * even if they advertise zero window. Hence, connection is killed only
+	 * if we received no ACKs for normal connection timeout. It is not killed
+	 * only because window stays zero for some time, window may be zero
+	 * until armageddon and even later. We are in full accordance
+	 * with RFCs, only probe timer combines both retransmission timeout
+	 * and probe timeout in one bottle.				--ANK
+	 */
+	max_probes = sysctl_tcp_retries2;
+
+	if (sock_flag(sk, SOCK_DEAD)) {
+		int alive = ((tp->rto<<tp->backoff) < TCP_RTO_MAX);
+ 
+		max_probes = tcp_orphan_retries(sk, alive);
+
+		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
+			return;
+	}
+
+	if (tp->probes_out > max_probes) {
+		tcp_write_err(sk);
+	} else {
+		/* Only send another probe if we didn't close things up. */
+		tcp_send_probe0(sk);
+	}
+}
+
+/*
+ *	The TCP retransmit timer.
+ */
+
+static void tcp_retransmit_timer(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->packets_out)
+		goto out;
+
+	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
+
+	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
+	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
+		/* Receiver dastardly shrinks window. Our retransmits
+		 * become zero probes, but we should not timeout this
+		 * connection. If the socket is an orphan, time it out,
+		 * we cannot allow such beasts to hang infinitely.
+		 */
+#ifdef TCP_DEBUG
+		if (net_ratelimit()) {
+			struct inet_sock *inet = inet_sk(sk);
+			printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
+			       NIPQUAD(inet->daddr), htons(inet->dport),
+			       inet->num, tp->snd_una, tp->snd_nxt);
+		}
+#endif
+		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
+			tcp_write_err(sk);
+			goto out;
+		}
+		tcp_enter_loss(sk, 0);
+		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
+		__sk_dst_reset(sk);
+		goto out_reset_timer;
+	}
+
+	if (tcp_write_timeout(sk))
+		goto out;
+
+	if (tp->retransmits == 0) {
+		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
+			if (tp->rx_opt.sack_ok) {
+				if (tp->ca_state == TCP_CA_Recovery)
+					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
+				else
+					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
+			} else {
+				if (tp->ca_state == TCP_CA_Recovery)
+					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
+				else
+					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
+			}
+		} else if (tp->ca_state == TCP_CA_Loss) {
+			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
+		} else {
+			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
+		}
+	}
+
+	if (tcp_use_frto(sk)) {
+		tcp_enter_frto(sk);
+	} else {
+		tcp_enter_loss(sk, 0);
+	}
+
+	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
+		/* Retransmission failed because of local congestion,
+		 * do not backoff.
+		 */
+		if (!tp->retransmits)
+			tp->retransmits=1;
+		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS,
+				     min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL));
+		goto out;
+	}
+
+	/* Increase the timeout each time we retransmit.  Note that
+	 * we do not increase the rtt estimate.  rto is initialized
+	 * from rtt, but increases here.  Jacobson (SIGCOMM 88) suggests
+	 * that doubling rto each time is the least we can get away with.
+	 * In KA9Q, Karn uses this for the first few times, and then
+	 * goes to quadratic.  netBSD doubles, but only goes up to *64,
+	 * and clamps at 1 to 64 sec afterwards.  Note that 120 sec is
+	 * defined in the protocol as the maximum possible RTT.  I guess
+	 * we'll have to use something other than TCP to talk to the
+	 * University of Mars.
+	 *
+	 * PAWS allows us longer timeouts and large windows, so once
+	 * implemented ftp to mars will work nicely. We will have to fix
+	 * the 120 second clamps though!
+	 */
+	tp->backoff++;
+	tp->retransmits++;
+
+out_reset_timer:
+	tp->rto = min(tp->rto << 1, TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+	if (tp->retransmits > sysctl_tcp_retries1)
+		__sk_dst_reset(sk);
+
+out:;
+}
+
+static void tcp_write_timer(unsigned long data)
+{
+	struct sock *sk = (struct sock*)data;
+	struct tcp_sock *tp = tcp_sk(sk);
+	int event;
+
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later */
+		sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20));
+		goto out_unlock;
+	}
+
+	if (sk->sk_state == TCP_CLOSE || !tp->pending)
+		goto out;
+
+	if (time_after(tp->timeout, jiffies)) {
+		sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout);
+		goto out;
+	}
+
+	event = tp->pending;
+	tp->pending = 0;
+
+	switch (event) {
+	case TCP_TIME_RETRANS:
+		tcp_retransmit_timer(sk);
+		break;
+	case TCP_TIME_PROBE0:
+		tcp_probe_timer(sk);
+		break;
+	}
+	TCP_CHECK_TIMER(sk);
+
+out:
+	sk_stream_mem_reclaim(sk);
+out_unlock:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+/*
+ *	Timer for listening sockets
+ */
+
+static void tcp_synack_timer(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_listen_opt *lopt = tp->listen_opt;
+	int max_retries = tp->syn_retries ? : sysctl_tcp_synack_retries;
+	int thresh = max_retries;
+	unsigned long now = jiffies;
+	struct open_request **reqp, *req;
+	int i, budget;
+
+	if (lopt == NULL || lopt->qlen == 0)
+		return;
+
+	/* Normally all the openreqs are young and become mature
+	 * (i.e. converted to established socket) for first timeout.
+	 * If synack was not acknowledged for 3 seconds, it means
+	 * one of the following things: synack was lost, ack was lost,
+	 * rtt is high or nobody planned to ack (i.e. synflood).
+	 * When server is a bit loaded, queue is populated with old
+	 * open requests, reducing effective size of queue.
+	 * When server is well loaded, queue size reduces to zero
+	 * after several minutes of work. It is not synflood,
+	 * it is normal operation. The solution is pruning
+	 * too old entries overriding normal timeout, when
+	 * situation becomes dangerous.
+	 *
+	 * Essentially, we reserve half of room for young
+	 * embrions; and abort old ones without pity, if old
+	 * ones are about to clog our table.
+	 */
+	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
+		int young = (lopt->qlen_young<<1);
+
+		while (thresh > 2) {
+			if (lopt->qlen < young)
+				break;
+			thresh--;
+			young <<= 1;
+		}
+	}
+
+	if (tp->defer_accept)
+		max_retries = tp->defer_accept;
+
+	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
+	i = lopt->clock_hand;
+
+	do {
+		reqp=&lopt->syn_table[i];
+		while ((req = *reqp) != NULL) {
+			if (time_after_eq(now, req->expires)) {
+				if ((req->retrans < thresh ||
+				     (req->acked && req->retrans < max_retries))
+				    && !req->class->rtx_syn_ack(sk, req, NULL)) {
+					unsigned long timeo;
+
+					if (req->retrans++ == 0)
+						lopt->qlen_young--;
+					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
+						    TCP_RTO_MAX);
+					req->expires = now + timeo;
+					reqp = &req->dl_next;
+					continue;
+				}
+
+				/* Drop this request */
+				write_lock(&tp->syn_wait_lock);
+				*reqp = req->dl_next;
+				write_unlock(&tp->syn_wait_lock);
+				lopt->qlen--;
+				if (req->retrans == 0)
+					lopt->qlen_young--;
+				tcp_openreq_free(req);
+				continue;
+			}
+			reqp = &req->dl_next;
+		}
+
+		i = (i+1)&(TCP_SYNQ_HSIZE-1);
+
+	} while (--budget > 0);
+
+	lopt->clock_hand = i;
+
+	if (lopt->qlen)
+		tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
+}
+
+void tcp_delete_keepalive_timer (struct sock *sk)
+{
+	sk_stop_timer(sk, &sk->sk_timer);
+}
+
+void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len)
+{
+	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
+}
+
+void tcp_set_keepalive(struct sock *sk, int val)
+{
+	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
+		return;
+
+	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
+		tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
+	else if (!val)
+		tcp_delete_keepalive_timer(sk);
+}
+
+
+static void tcp_keepalive_timer (unsigned long data)
+{
+	struct sock *sk = (struct sock *) data;
+	struct tcp_sock *tp = tcp_sk(sk);
+	__u32 elapsed;
+
+	/* Only process if socket is not in use. */
+	bh_lock_sock(sk);
+	if (sock_owned_by_user(sk)) {
+		/* Try again later. */ 
+		tcp_reset_keepalive_timer (sk, HZ/20);
+		goto out;
+	}
+
+	if (sk->sk_state == TCP_LISTEN) {
+		tcp_synack_timer(sk);
+		goto out;
+	}
+
+	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+		if (tp->linger2 >= 0) {
+			int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN;
+
+			if (tmo > 0) {
+				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
+				goto out;
+			}
+		}
+		tcp_send_active_reset(sk, GFP_ATOMIC);
+		goto death;
+	}
+
+	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
+		goto out;
+
+	elapsed = keepalive_time_when(tp);
+
+	/* It is alive without keepalive 8) */
+	if (tp->packets_out || sk->sk_send_head)
+		goto resched;
+
+	elapsed = tcp_time_stamp - tp->rcv_tstamp;
+
+	if (elapsed >= keepalive_time_when(tp)) {
+		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
+		     (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
+			tcp_send_active_reset(sk, GFP_ATOMIC);
+			tcp_write_err(sk);
+			goto out;
+		}
+		if (tcp_write_wakeup(sk) <= 0) {
+			tp->probes_out++;
+			elapsed = keepalive_intvl_when(tp);
+		} else {
+			/* If keepalive was lost due to local congestion,
+			 * try harder.
+			 */
+			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
+		}
+	} else {
+		/* It is tp->rcv_tstamp + keepalive_time_when(tp) */
+		elapsed = keepalive_time_when(tp) - elapsed;
+	}
+
+	TCP_CHECK_TIMER(sk);
+	sk_stream_mem_reclaim(sk);
+
+resched:
+	tcp_reset_keepalive_timer (sk, elapsed);
+	goto out;
+
+death:	
+	tcp_done(sk);
+
+out:
+	bh_unlock_sock(sk);
+	sock_put(sk);
+}
+
+EXPORT_SYMBOL(tcp_clear_xmit_timers);
+EXPORT_SYMBOL(tcp_delete_keepalive_timer);
+EXPORT_SYMBOL(tcp_init_xmit_timers);
+EXPORT_SYMBOL(tcp_reset_keepalive_timer);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
new file mode 100644
index 000000000000..6baddfbedca3
--- /dev/null
+++ b/net/ipv4/udp.c
@@ -0,0 +1,1575 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		The User Datagram Protocol (UDP).
+ *
+ * Version:	$Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $
+ *
+ * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
+ *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ *		Alan Cox, <Alan.Cox@linux.org>
+ *		Hirokazu Takahashi, <taka@valinux.co.jp>
+ *
+ * Fixes:
+ *		Alan Cox	:	verify_area() calls
+ *		Alan Cox	: 	stopped close while in use off icmp
+ *					messages. Not a fix but a botch that
+ *					for udp at least is 'valid'.
+ *		Alan Cox	:	Fixed icmp handling properly
+ *		Alan Cox	: 	Correct error for oversized datagrams
+ *		Alan Cox	:	Tidied select() semantics. 
+ *		Alan Cox	:	udp_err() fixed properly, also now 
+ *					select and read wake correctly on errors
+ *		Alan Cox	:	udp_send verify_area moved to avoid mem leak
+ *		Alan Cox	:	UDP can count its memory
+ *		Alan Cox	:	send to an unknown connection causes
+ *					an ECONNREFUSED off the icmp, but
+ *					does NOT close.
+ *		Alan Cox	:	Switched to new sk_buff handlers. No more backlog!
+ *		Alan Cox	:	Using generic datagram code. Even smaller and the PEEK
+ *					bug no longer crashes it.
+ *		Fred Van Kempen	: 	Net2e support for sk->broadcast.
+ *		Alan Cox	:	Uses skb_free_datagram
+ *		Alan Cox	:	Added get/set sockopt support.
+ *		Alan Cox	:	Broadcasting without option set returns EACCES.
+ *		Alan Cox	:	No wakeup calls. Instead we now use the callbacks.
+ *		Alan Cox	:	Use ip_tos and ip_ttl
+ *		Alan Cox	:	SNMP Mibs
+ *		Alan Cox	:	MSG_DONTROUTE, and 0.0.0.0 support.
+ *		Matt Dillon	:	UDP length checks.
+ *		Alan Cox	:	Smarter af_inet used properly.
+ *		Alan Cox	:	Use new kernel side addressing.
+ *		Alan Cox	:	Incorrect return on truncated datagram receive.
+ *	Arnt Gulbrandsen 	:	New udp_send and stuff
+ *		Alan Cox	:	Cache last socket
+ *		Alan Cox	:	Route cache
+ *		Jon Peatfield	:	Minor efficiency fix to sendto().
+ *		Mike Shaver	:	RFC1122 checks.
+ *		Alan Cox	:	Nonblocking error fix.
+ *	Willy Konynenberg	:	Transparent proxying support.
+ *		Mike McLagan	:	Routing by source
+ *		David S. Miller	:	New socket lookup architecture.
+ *					Last socket cache retained as it
+ *					does have a high hit rate.
+ *		Olaf Kirch	:	Don't linearise iovec on sendmsg.
+ *		Andi Kleen	:	Some cleanups, cache destination entry
+ *					for connect. 
+ *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
+ *		Melvin Smith	:	Check msg_name not msg_namelen in sendto(),
+ *					return ENOTCONN for unconnected sockets (POSIX)
+ *		Janos Farkas	:	don't deliver multi/broadcasts to a different
+ *					bound-to-device socket
+ *	Hirokazu Takahashi	:	HW checksumming for outgoing UDP
+ *					datagrams.
+ *	Hirokazu Takahashi	:	sendfile() on UDP works now.
+ *		Arnaldo C. Melo :	convert /proc/net/udp to seq_file
+ *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
+ *	Alexey Kuznetsov:		allow both IPv4 and IPv6 sockets to bind
+ *					a single port at the same time.
+ *	Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
+ *
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+ 
+#include <asm/system.h>
+#include <asm/uaccess.h>
+#include <asm/ioctls.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/timer.h>
+#include <linux/mm.h>
+#include <linux/config.h>
+#include <linux/inet.h>
+#include <linux/ipv6.h>
+#include <linux/netdevice.h>
+#include <net/snmp.h>
+#include <net/tcp.h>
+#include <net/protocol.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/sock.h>
+#include <net/udp.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/inet_common.h>
+#include <net/checksum.h>
+#include <net/xfrm.h>
+
+/*
+ *	Snmp MIB for the UDP layer
+ */
+
+DEFINE_SNMP_STAT(struct udp_mib, udp_statistics);
+
+struct hlist_head udp_hash[UDP_HTABLE_SIZE];
+DEFINE_RWLOCK(udp_hash_lock);
+
+/* Shared by v4/v6 udp. */
+int udp_port_rover;
+
+static int udp_v4_get_port(struct sock *sk, unsigned short snum)
+{
+	struct hlist_node *node;
+	struct sock *sk2;
+	struct inet_sock *inet = inet_sk(sk);
+
+	write_lock_bh(&udp_hash_lock);
+	if (snum == 0) {
+		int best_size_so_far, best, result, i;
+
+		if (udp_port_rover > sysctl_local_port_range[1] ||
+		    udp_port_rover < sysctl_local_port_range[0])
+			udp_port_rover = sysctl_local_port_range[0];
+		best_size_so_far = 32767;
+		best = result = udp_port_rover;
+		for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) {
+			struct hlist_head *list;
+			int size;
+
+			list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)];
+			if (hlist_empty(list)) {
+				if (result > sysctl_local_port_range[1])
+					result = sysctl_local_port_range[0] +
+						((result - sysctl_local_port_range[0]) &
+						 (UDP_HTABLE_SIZE - 1));
+				goto gotit;
+			}
+			size = 0;
+			sk_for_each(sk2, node, list)
+				if (++size >= best_size_so_far)
+					goto next;
+			best_size_so_far = size;
+			best = result;
+		next:;
+		}
+		result = best;
+		for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) {
+			if (result > sysctl_local_port_range[1])
+				result = sysctl_local_port_range[0]
+					+ ((result - sysctl_local_port_range[0]) &
+					   (UDP_HTABLE_SIZE - 1));
+			if (!udp_lport_inuse(result))
+				break;
+		}
+		if (i >= (1 << 16) / UDP_HTABLE_SIZE)
+			goto fail;
+gotit:
+		udp_port_rover = snum = result;
+	} else {
+		sk_for_each(sk2, node,
+			    &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) {
+			struct inet_sock *inet2 = inet_sk(sk2);
+
+			if (inet2->num == snum &&
+			    sk2 != sk &&
+			    !ipv6_only_sock(sk2) &&
+			    (!sk2->sk_bound_dev_if ||
+			     !sk->sk_bound_dev_if ||
+			     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+			    (!inet2->rcv_saddr ||
+			     !inet->rcv_saddr ||
+			     inet2->rcv_saddr == inet->rcv_saddr) &&
+			    (!sk2->sk_reuse || !sk->sk_reuse))
+				goto fail;
+		}
+	}
+	inet->num = snum;
+	if (sk_unhashed(sk)) {
+		struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)];
+
+		sk_add_node(sk, h);
+		sock_prot_inc_use(sk->sk_prot);
+	}
+	write_unlock_bh(&udp_hash_lock);
+	return 0;
+
+fail:
+	write_unlock_bh(&udp_hash_lock);
+	return 1;
+}
+
+static void udp_v4_hash(struct sock *sk)
+{
+	BUG();
+}
+
+static void udp_v4_unhash(struct sock *sk)
+{
+	write_lock_bh(&udp_hash_lock);
+	if (sk_del_node_init(sk)) {
+		inet_sk(sk)->num = 0;
+		sock_prot_dec_use(sk->sk_prot);
+	}
+	write_unlock_bh(&udp_hash_lock);
+}
+
+/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
+ * harder than this. -DaveM
+ */
+static struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport,
+					  u32 daddr, u16 dport, int dif)
+{
+	struct sock *sk, *result = NULL;
+	struct hlist_node *node;
+	unsigned short hnum = ntohs(dport);
+	int badness = -1;
+
+	sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) {
+		struct inet_sock *inet = inet_sk(sk);
+
+		if (inet->num == hnum && !ipv6_only_sock(sk)) {
+			int score = (sk->sk_family == PF_INET ? 1 : 0);
+			if (inet->rcv_saddr) {
+				if (inet->rcv_saddr != daddr)
+					continue;
+				score+=2;
+			}
+			if (inet->daddr) {
+				if (inet->daddr != saddr)
+					continue;
+				score+=2;
+			}
+			if (inet->dport) {
+				if (inet->dport != sport)
+					continue;
+				score+=2;
+			}
+			if (sk->sk_bound_dev_if) {
+				if (sk->sk_bound_dev_if != dif)
+					continue;
+				score+=2;
+			}
+			if(score == 9) {
+				result = sk;
+				break;
+			} else if(score > badness) {
+				result = sk;
+				badness = score;
+			}
+		}
+	}
+	return result;
+}
+
+static __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport,
+					     u32 daddr, u16 dport, int dif)
+{
+	struct sock *sk;
+
+	read_lock(&udp_hash_lock);
+	sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif);
+	if (sk)
+		sock_hold(sk);
+	read_unlock(&udp_hash_lock);
+	return sk;
+}
+
+static inline struct sock *udp_v4_mcast_next(struct sock *sk,
+					     u16 loc_port, u32 loc_addr,
+					     u16 rmt_port, u32 rmt_addr,
+					     int dif)
+{
+	struct hlist_node *node;
+	struct sock *s = sk;
+	unsigned short hnum = ntohs(loc_port);
+
+	sk_for_each_from(s, node) {
+		struct inet_sock *inet = inet_sk(s);
+
+		if (inet->num != hnum					||
+		    (inet->daddr && inet->daddr != rmt_addr)		||
+		    (inet->dport != rmt_port && inet->dport)		||
+		    (inet->rcv_saddr && inet->rcv_saddr != loc_addr)	||
+		    ipv6_only_sock(s)					||
+		    (s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
+			continue;
+		if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
+			continue;
+		goto found;
+  	}
+	s = NULL;
+found:
+  	return s;
+}
+
+/*
+ * This routine is called by the ICMP module when it gets some
+ * sort of error condition.  If err < 0 then the socket should
+ * be closed and the error returned to the user.  If err > 0
+ * it's just the icmp type << 8 | icmp code.  
+ * Header points to the ip header of the error packet. We move
+ * on past this. Then (as it used to claim before adjustment)
+ * header points to the first 8 bytes of the udp header.  We need
+ * to find the appropriate port.
+ */
+
+void udp_err(struct sk_buff *skb, u32 info)
+{
+	struct inet_sock *inet;
+	struct iphdr *iph = (struct iphdr*)skb->data;
+	struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2));
+	int type = skb->h.icmph->type;
+	int code = skb->h.icmph->code;
+	struct sock *sk;
+	int harderr;
+	int err;
+
+	sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex);
+	if (sk == NULL) {
+		ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
+    	  	return;	/* No socket for error */
+	}
+
+	err = 0;
+	harderr = 0;
+	inet = inet_sk(sk);
+
+	switch (type) {
+	default:
+	case ICMP_TIME_EXCEEDED:
+		err = EHOSTUNREACH;
+		break;
+	case ICMP_SOURCE_QUENCH:
+		goto out;
+	case ICMP_PARAMETERPROB:
+		err = EPROTO;
+		harderr = 1;
+		break;
+	case ICMP_DEST_UNREACH:
+		if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */
+			if (inet->pmtudisc != IP_PMTUDISC_DONT) {
+				err = EMSGSIZE;
+				harderr = 1;
+				break;
+			}
+			goto out;
+		}
+		err = EHOSTUNREACH;
+		if (code <= NR_ICMP_UNREACH) {
+			harderr = icmp_err_convert[code].fatal;
+			err = icmp_err_convert[code].errno;
+		}
+		break;
+	}
+
+	/*
+	 *      RFC1122: OK.  Passes ICMP errors back to application, as per 
+	 *	4.1.3.3.
+	 */
+	if (!inet->recverr) {
+		if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+			goto out;
+	} else {
+		ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1));
+	}
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+out:
+	sock_put(sk);
+}
+
+/*
+ * Throw away all pending data and cancel the corking. Socket is locked.
+ */
+static void udp_flush_pending_frames(struct sock *sk)
+{
+	struct udp_sock *up = udp_sk(sk);
+
+	if (up->pending) {
+		up->len = 0;
+		up->pending = 0;
+		ip_flush_pending_frames(sk);
+	}
+}
+
+/*
+ * Push out all pending data as one UDP datagram. Socket is locked.
+ */
+static int udp_push_pending_frames(struct sock *sk, struct udp_sock *up)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct flowi *fl = &inet->cork.fl;
+	struct sk_buff *skb;
+	struct udphdr *uh;
+	int err = 0;
+
+	/* Grab the skbuff where UDP header space exists. */
+	if ((skb = skb_peek(&sk->sk_write_queue)) == NULL)
+		goto out;
+
+	/*
+	 * Create a UDP header
+	 */
+	uh = skb->h.uh;
+	uh->source = fl->fl_ip_sport;
+	uh->dest = fl->fl_ip_dport;
+	uh->len = htons(up->len);
+	uh->check = 0;
+
+	if (sk->sk_no_check == UDP_CSUM_NOXMIT) {
+		skb->ip_summed = CHECKSUM_NONE;
+		goto send;
+	}
+
+	if (skb_queue_len(&sk->sk_write_queue) == 1) {
+		/*
+		 * Only one fragment on the socket.
+		 */
+		if (skb->ip_summed == CHECKSUM_HW) {
+			skb->csum = offsetof(struct udphdr, check);
+			uh->check = ~csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
+					up->len, IPPROTO_UDP, 0);
+		} else {
+			skb->csum = csum_partial((char *)uh,
+					sizeof(struct udphdr), skb->csum);
+			uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
+					up->len, IPPROTO_UDP, skb->csum);
+			if (uh->check == 0)
+				uh->check = -1;
+		}
+	} else {
+		unsigned int csum = 0;
+		/*
+		 * HW-checksum won't work as there are two or more 
+		 * fragments on the socket so that all csums of sk_buffs
+		 * should be together.
+		 */
+		if (skb->ip_summed == CHECKSUM_HW) {
+			int offset = (unsigned char *)uh - skb->data;
+			skb->csum = skb_checksum(skb, offset, skb->len - offset, 0);
+
+			skb->ip_summed = CHECKSUM_NONE;
+		} else {
+			skb->csum = csum_partial((char *)uh,
+					sizeof(struct udphdr), skb->csum);
+		}
+
+		skb_queue_walk(&sk->sk_write_queue, skb) {
+			csum = csum_add(csum, skb->csum);
+		}
+		uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst,
+				up->len, IPPROTO_UDP, csum);
+		if (uh->check == 0)
+			uh->check = -1;
+	}
+send:
+	err = ip_push_pending_frames(sk);
+out:
+	up->len = 0;
+	up->pending = 0;
+	return err;
+}
+
+
+static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base)
+{
+	return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base));
+}
+
+int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		size_t len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	struct udp_sock *up = udp_sk(sk);
+	int ulen = len;
+	struct ipcm_cookie ipc;
+	struct rtable *rt = NULL;
+	int free = 0;
+	int connected = 0;
+	u32 daddr, faddr, saddr;
+	u16 dport;
+	u8  tos;
+	int err;
+	int corkreq = up->corkflag || msg->msg_flags&MSG_MORE;
+
+	if (len > 0xFFFF)
+		return -EMSGSIZE;
+
+	/* 
+	 *	Check the flags.
+	 */
+
+	if (msg->msg_flags&MSG_OOB)	/* Mirror BSD error message compatibility */
+		return -EOPNOTSUPP;
+
+	ipc.opt = NULL;
+
+	if (up->pending) {
+		/*
+		 * There are pending frames.
+	 	 * The socket lock must be held while it's corked.
+		 */
+		lock_sock(sk);
+		if (likely(up->pending)) {
+			if (unlikely(up->pending != AF_INET)) {
+				release_sock(sk);
+				return -EINVAL;
+			}
+ 			goto do_append_data;
+		}
+		release_sock(sk);
+	}
+	ulen += sizeof(struct udphdr);
+
+	/*
+	 *	Get and verify the address. 
+	 */
+	if (msg->msg_name) {
+		struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name;
+		if (msg->msg_namelen < sizeof(*usin))
+			return -EINVAL;
+		if (usin->sin_family != AF_INET) {
+			if (usin->sin_family != AF_UNSPEC)
+				return -EAFNOSUPPORT;
+		}
+
+		daddr = usin->sin_addr.s_addr;
+		dport = usin->sin_port;
+		if (dport == 0)
+			return -EINVAL;
+	} else {
+		if (sk->sk_state != TCP_ESTABLISHED)
+			return -EDESTADDRREQ;
+		daddr = inet->daddr;
+		dport = inet->dport;
+		/* Open fast path for connected socket.
+		   Route will not be used, if at least one option is set.
+		 */
+		connected = 1;
+  	}
+	ipc.addr = inet->saddr;
+
+	ipc.oif = sk->sk_bound_dev_if;
+	if (msg->msg_controllen) {
+		err = ip_cmsg_send(msg, &ipc);
+		if (err)
+			return err;
+		if (ipc.opt)
+			free = 1;
+		connected = 0;
+	}
+	if (!ipc.opt)
+		ipc.opt = inet->opt;
+
+	saddr = ipc.addr;
+	ipc.addr = faddr = daddr;
+
+	if (ipc.opt && ipc.opt->srr) {
+		if (!daddr)
+			return -EINVAL;
+		faddr = ipc.opt->faddr;
+		connected = 0;
+	}
+	tos = RT_TOS(inet->tos);
+	if (sock_flag(sk, SOCK_LOCALROUTE) ||
+	    (msg->msg_flags & MSG_DONTROUTE) || 
+	    (ipc.opt && ipc.opt->is_strictroute)) {
+		tos |= RTO_ONLINK;
+		connected = 0;
+	}
+
+	if (MULTICAST(daddr)) {
+		if (!ipc.oif)
+			ipc.oif = inet->mc_index;
+		if (!saddr)
+			saddr = inet->mc_addr;
+		connected = 0;
+	}
+
+	if (connected)
+		rt = (struct rtable*)sk_dst_check(sk, 0);
+
+	if (rt == NULL) {
+		struct flowi fl = { .oif = ipc.oif,
+				    .nl_u = { .ip4_u =
+					      { .daddr = faddr,
+						.saddr = saddr,
+						.tos = tos } },
+				    .proto = IPPROTO_UDP,
+				    .uli_u = { .ports =
+					       { .sport = inet->sport,
+						 .dport = dport } } };
+		err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT));
+		if (err)
+			goto out;
+
+		err = -EACCES;
+		if ((rt->rt_flags & RTCF_BROADCAST) &&
+		    !sock_flag(sk, SOCK_BROADCAST))
+			goto out;
+		if (connected)
+			sk_dst_set(sk, dst_clone(&rt->u.dst));
+	}
+
+	if (msg->msg_flags&MSG_CONFIRM)
+		goto do_confirm;
+back_from_confirm:
+
+	saddr = rt->rt_src;
+	if (!ipc.addr)
+		daddr = ipc.addr = rt->rt_dst;
+
+	lock_sock(sk);
+	if (unlikely(up->pending)) {
+		/* The socket is already corked while preparing it. */
+		/* ... which is an evident application bug. --ANK */
+		release_sock(sk);
+
+		NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n"));
+		err = -EINVAL;
+		goto out;
+	}
+	/*
+	 *	Now cork the socket to pend data.
+	 */
+	inet->cork.fl.fl4_dst = daddr;
+	inet->cork.fl.fl_ip_dport = dport;
+	inet->cork.fl.fl4_src = saddr;
+	inet->cork.fl.fl_ip_sport = inet->sport;
+	up->pending = AF_INET;
+
+do_append_data:
+	up->len += ulen;
+	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, 
+			sizeof(struct udphdr), &ipc, rt, 
+			corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags);
+	if (err)
+		udp_flush_pending_frames(sk);
+	else if (!corkreq)
+		err = udp_push_pending_frames(sk, up);
+	release_sock(sk);
+
+out:
+	ip_rt_put(rt);
+	if (free)
+		kfree(ipc.opt);
+	if (!err) {
+		UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS);
+		return len;
+	}
+	return err;
+
+do_confirm:
+	dst_confirm(&rt->u.dst);
+	if (!(msg->msg_flags&MSG_PROBE) || len)
+		goto back_from_confirm;
+	err = 0;
+	goto out;
+}
+
+static int udp_sendpage(struct sock *sk, struct page *page, int offset,
+			size_t size, int flags)
+{
+	struct udp_sock *up = udp_sk(sk);
+	int ret;
+
+	if (!up->pending) {
+		struct msghdr msg = {	.msg_flags = flags|MSG_MORE };
+
+		/* Call udp_sendmsg to specify destination address which
+		 * sendpage interface can't pass.
+		 * This will succeed only when the socket is connected.
+		 */
+		ret = udp_sendmsg(NULL, sk, &msg, 0);
+		if (ret < 0)
+			return ret;
+	}
+
+	lock_sock(sk);
+
+	if (unlikely(!up->pending)) {
+		release_sock(sk);
+
+		NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n"));
+		return -EINVAL;
+	}
+
+	ret = ip_append_page(sk, page, offset, size, flags);
+	if (ret == -EOPNOTSUPP) {
+		release_sock(sk);
+		return sock_no_sendpage(sk->sk_socket, page, offset,
+					size, flags);
+	}
+	if (ret < 0) {
+		udp_flush_pending_frames(sk);
+		goto out;
+	}
+
+	up->len += size;
+	if (!(up->corkflag || (flags&MSG_MORE)))
+		ret = udp_push_pending_frames(sk, up);
+	if (!ret)
+		ret = size;
+out:
+	release_sock(sk);
+	return ret;
+}
+
+/*
+ *	IOCTL requests applicable to the UDP protocol
+ */
+ 
+int udp_ioctl(struct sock *sk, int cmd, unsigned long arg)
+{
+	switch(cmd) 
+	{
+		case SIOCOUTQ:
+		{
+			int amount = atomic_read(&sk->sk_wmem_alloc);
+			return put_user(amount, (int __user *)arg);
+		}
+
+		case SIOCINQ:
+		{
+			struct sk_buff *skb;
+			unsigned long amount;
+
+			amount = 0;
+			spin_lock_irq(&sk->sk_receive_queue.lock);
+			skb = skb_peek(&sk->sk_receive_queue);
+			if (skb != NULL) {
+				/*
+				 * We will only return the amount
+				 * of this packet since that is all
+				 * that will be read.
+				 */
+				amount = skb->len - sizeof(struct udphdr);
+			}
+			spin_unlock_irq(&sk->sk_receive_queue.lock);
+			return put_user(amount, (int __user *)arg);
+		}
+
+		default:
+			return -ENOIOCTLCMD;
+	}
+	return(0);
+}
+
+static __inline__ int __udp_checksum_complete(struct sk_buff *skb)
+{
+	return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum));
+}
+
+static __inline__ int udp_checksum_complete(struct sk_buff *skb)
+{
+	return skb->ip_summed != CHECKSUM_UNNECESSARY &&
+		__udp_checksum_complete(skb);
+}
+
+/*
+ * 	This should be easy, if there is something there we
+ * 	return it, otherwise we block.
+ */
+
+static int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
+		       size_t len, int noblock, int flags, int *addr_len)
+{
+	struct inet_sock *inet = inet_sk(sk);
+  	struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
+  	struct sk_buff *skb;
+  	int copied, err;
+
+	/*
+	 *	Check any passed addresses
+	 */
+	if (addr_len)
+		*addr_len=sizeof(*sin);
+
+	if (flags & MSG_ERRQUEUE)
+		return ip_recv_error(sk, msg, len);
+
+try_again:
+	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	if (!skb)
+		goto out;
+  
+  	copied = skb->len - sizeof(struct udphdr);
+	if (copied > len) {
+		copied = len;
+		msg->msg_flags |= MSG_TRUNC;
+	}
+
+	if (skb->ip_summed==CHECKSUM_UNNECESSARY) {
+		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+					      copied);
+	} else if (msg->msg_flags&MSG_TRUNC) {
+		if (__udp_checksum_complete(skb))
+			goto csum_copy_err;
+		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov,
+					      copied);
+	} else {
+		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
+
+		if (err == -EINVAL)
+			goto csum_copy_err;
+	}
+
+	if (err)
+		goto out_free;
+
+	sock_recv_timestamp(msg, sk, skb);
+
+	/* Copy the address. */
+	if (sin)
+	{
+		sin->sin_family = AF_INET;
+		sin->sin_port = skb->h.uh->source;
+		sin->sin_addr.s_addr = skb->nh.iph->saddr;
+		memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+  	}
+	if (inet->cmsg_flags)
+		ip_cmsg_recv(msg, skb);
+
+	err = copied;
+	if (flags & MSG_TRUNC)
+		err = skb->len - sizeof(struct udphdr);
+  
+out_free:
+  	skb_free_datagram(sk, skb);
+out:
+  	return err;
+
+csum_copy_err:
+	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+
+	/* Clear queue. */
+	if (flags&MSG_PEEK) {
+		int clear = 0;
+		spin_lock_irq(&sk->sk_receive_queue.lock);
+		if (skb == skb_peek(&sk->sk_receive_queue)) {
+			__skb_unlink(skb, &sk->sk_receive_queue);
+			clear = 1;
+		}
+		spin_unlock_irq(&sk->sk_receive_queue.lock);
+		if (clear)
+			kfree_skb(skb);
+	}
+
+	skb_free_datagram(sk, skb);
+
+	if (noblock)
+		return -EAGAIN;	
+	goto try_again;
+}
+
+
+int udp_disconnect(struct sock *sk, int flags)
+{
+	struct inet_sock *inet = inet_sk(sk);
+	/*
+	 *	1003.1g - break association.
+	 */
+	 
+	sk->sk_state = TCP_CLOSE;
+	inet->daddr = 0;
+	inet->dport = 0;
+	sk->sk_bound_dev_if = 0;
+	if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
+		inet_reset_saddr(sk);
+
+	if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
+		sk->sk_prot->unhash(sk);
+		inet->sport = 0;
+	}
+	sk_dst_reset(sk);
+	return 0;
+}
+
+static void udp_close(struct sock *sk, long timeout)
+{
+	sk_common_release(sk);
+}
+
+/* return:
+ * 	1  if the the UDP system should process it
+ *	0  if we should drop this packet
+ * 	-1 if it should get processed by xfrm4_rcv_encap
+ */
+static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb)
+{
+#ifndef CONFIG_XFRM
+	return 1; 
+#else
+	struct udp_sock *up = udp_sk(sk);
+  	struct udphdr *uh = skb->h.uh;
+	struct iphdr *iph;
+	int iphlen, len;
+  
+	__u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr);
+	__u32 *udpdata32 = (__u32 *)udpdata;
+	__u16 encap_type = up->encap_type;
+
+	/* if we're overly short, let UDP handle it */
+	if (udpdata > skb->tail)
+		return 1;
+
+	/* if this is not encapsulated socket, then just return now */
+	if (!encap_type)
+		return 1;
+
+	len = skb->tail - udpdata;
+
+	switch (encap_type) {
+	default:
+	case UDP_ENCAP_ESPINUDP:
+		/* Check if this is a keepalive packet.  If so, eat it. */
+		if (len == 1 && udpdata[0] == 0xff) {
+			return 0;
+		} else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) {
+			/* ESP Packet without Non-ESP header */
+			len = sizeof(struct udphdr);
+		} else
+			/* Must be an IKE packet.. pass it through */
+			return 1;
+		break;
+	case UDP_ENCAP_ESPINUDP_NON_IKE:
+		/* Check if this is a keepalive packet.  If so, eat it. */
+		if (len == 1 && udpdata[0] == 0xff) {
+			return 0;
+		} else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) &&
+			   udpdata32[0] == 0 && udpdata32[1] == 0) {
+			
+			/* ESP Packet with Non-IKE marker */
+			len = sizeof(struct udphdr) + 2 * sizeof(u32);
+		} else
+			/* Must be an IKE packet.. pass it through */
+			return 1;
+		break;
+	}
+
+	/* At this point we are sure that this is an ESPinUDP packet,
+	 * so we need to remove 'len' bytes from the packet (the UDP
+	 * header and optional ESP marker bytes) and then modify the
+	 * protocol to ESP, and then call into the transform receiver.
+	 */
+
+	/* Now we can update and verify the packet length... */
+	iph = skb->nh.iph;
+	iphlen = iph->ihl << 2;
+	iph->tot_len = htons(ntohs(iph->tot_len) - len);
+	if (skb->len < iphlen + len) {
+		/* packet is too small!?! */
+		return 0;
+	}
+
+	/* pull the data buffer up to the ESP header and set the
+	 * transport header to point to ESP.  Keep UDP on the stack
+	 * for later.
+	 */
+	skb->h.raw = skb_pull(skb, len);
+
+	/* modify the protocol (it's ESP!) */
+	iph->protocol = IPPROTO_ESP;
+
+	/* and let the caller know to send this into the ESP processor... */
+	return -1;
+#endif
+}
+
+/* returns:
+ *  -1: error
+ *   0: success
+ *  >0: "udp encap" protocol resubmission
+ *
+ * Note that in the success and error cases, the skb is assumed to
+ * have either been requeued or freed.
+ */
+static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb)
+{
+	struct udp_sock *up = udp_sk(sk);
+
+	/*
+	 *	Charge it to the socket, dropping if the queue is full.
+	 */
+	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
+		kfree_skb(skb);
+		return -1;
+	}
+
+	if (up->encap_type) {
+		/*
+		 * This is an encapsulation socket, so let's see if this is
+		 * an encapsulated packet.
+		 * If it's a keepalive packet, then just eat it.
+		 * If it's an encapsulateed packet, then pass it to the
+		 * IPsec xfrm input and return the response
+		 * appropriately.  Otherwise, just fall through and
+		 * pass this up the UDP socket.
+		 */
+		int ret;
+
+		ret = udp_encap_rcv(sk, skb);
+		if (ret == 0) {
+			/* Eat the packet .. */
+			kfree_skb(skb);
+			return 0;
+		}
+		if (ret < 0) {
+			/* process the ESP packet */
+			ret = xfrm4_rcv_encap(skb, up->encap_type);
+			UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS);
+			return -ret;
+		}
+		/* FALLTHROUGH -- it's a UDP Packet */
+	}
+
+	if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) {
+		if (__udp_checksum_complete(skb)) {
+			UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+			kfree_skb(skb);
+			return -1;
+		}
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	}
+
+	if (sock_queue_rcv_skb(sk,skb)<0) {
+		UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+		kfree_skb(skb);
+		return -1;
+	}
+	UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS);
+	return 0;
+}
+
+/*
+ *	Multicasts and broadcasts go to each listener.
+ *
+ *	Note: called only from the BH handler context,
+ *	so we don't need to lock the hashes.
+ */
+static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh,
+				 u32 saddr, u32 daddr)
+{
+	struct sock *sk;
+	int dif;
+
+	read_lock(&udp_hash_lock);
+	sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]);
+	dif = skb->dev->ifindex;
+	sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif);
+	if (sk) {
+		struct sock *sknext = NULL;
+
+		do {
+			struct sk_buff *skb1 = skb;
+
+			sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr,
+						   uh->source, saddr, dif);
+			if(sknext)
+				skb1 = skb_clone(skb, GFP_ATOMIC);
+
+			if(skb1) {
+				int ret = udp_queue_rcv_skb(sk, skb1);
+				if (ret > 0)
+					/* we should probably re-process instead
+					 * of dropping packets here. */
+					kfree_skb(skb1);
+			}
+			sk = sknext;
+		} while(sknext);
+	} else
+		kfree_skb(skb);
+	read_unlock(&udp_hash_lock);
+	return 0;
+}
+
+/* Initialize UDP checksum. If exited with zero value (success),
+ * CHECKSUM_UNNECESSARY means, that no more checks are required.
+ * Otherwise, csum completion requires chacksumming packet body,
+ * including udp header and folding it to skb->csum.
+ */
+static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh,
+			     unsigned short ulen, u32 saddr, u32 daddr)
+{
+	if (uh->check == 0) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	} else if (skb->ip_summed == CHECKSUM_HW) {
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		if (!udp_check(uh, ulen, saddr, daddr, skb->csum))
+			return 0;
+		NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n"));
+		skb->ip_summed = CHECKSUM_NONE;
+	}
+	if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+		skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0);
+	/* Probably, we should checksum udp header (it should be in cache
+	 * in any case) and data in tiny packets (< rx copybreak).
+	 */
+	return 0;
+}
+
+/*
+ *	All we need to do is get the socket, and then do a checksum. 
+ */
+ 
+int udp_rcv(struct sk_buff *skb)
+{
+  	struct sock *sk;
+  	struct udphdr *uh;
+	unsigned short ulen;
+	struct rtable *rt = (struct rtable*)skb->dst;
+	u32 saddr = skb->nh.iph->saddr;
+	u32 daddr = skb->nh.iph->daddr;
+	int len = skb->len;
+
+	/*
+	 *	Validate the packet and the UDP length.
+	 */
+	if (!pskb_may_pull(skb, sizeof(struct udphdr)))
+		goto no_header;
+
+	uh = skb->h.uh;
+
+	ulen = ntohs(uh->len);
+
+	if (ulen > len || ulen < sizeof(*uh))
+		goto short_packet;
+
+	if (pskb_trim(skb, ulen))
+		goto short_packet;
+
+	if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0)
+		goto csum_error;
+
+	if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST))
+		return udp_v4_mcast_deliver(skb, uh, saddr, daddr);
+
+	sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
+
+	if (sk != NULL) {
+		int ret = udp_queue_rcv_skb(sk, skb);
+		sock_put(sk);
+
+		/* a return value > 0 means to resubmit the input, but
+		 * it it wants the return to be -protocol, or 0
+		 */
+		if (ret > 0)
+			return -ret;
+		return 0;
+	}
+
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+		goto drop;
+
+	/* No socket. Drop packet silently, if checksum is wrong */
+	if (udp_checksum_complete(skb))
+		goto csum_error;
+
+	UDP_INC_STATS_BH(UDP_MIB_NOPORTS);
+	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+
+	/*
+	 * Hmm.  We got an UDP packet to a port to which we
+	 * don't wanna listen.  Ignore it.
+	 */
+	kfree_skb(skb);
+	return(0);
+
+short_packet:
+	NETDEBUG(if (net_ratelimit())
+		printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n",
+			NIPQUAD(saddr),
+			ntohs(uh->source),
+			ulen,
+			len,
+			NIPQUAD(daddr),
+			ntohs(uh->dest)));
+no_header:
+	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+	kfree_skb(skb);
+	return(0);
+
+csum_error:
+	/* 
+	 * RFC1122: OK.  Discards the bad packet silently (as far as 
+	 * the network is concerned, anyway) as per 4.1.3.4 (MUST). 
+	 */
+	NETDEBUG(if (net_ratelimit())
+		 printk(KERN_DEBUG "UDP: bad checksum. From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n",
+			NIPQUAD(saddr),
+			ntohs(uh->source),
+			NIPQUAD(daddr),
+			ntohs(uh->dest),
+			ulen));
+drop:
+	UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+	kfree_skb(skb);
+	return(0);
+}
+
+static int udp_destroy_sock(struct sock *sk)
+{
+	lock_sock(sk);
+	udp_flush_pending_frames(sk);
+	release_sock(sk);
+	return 0;
+}
+
+/*
+ *	Socket option code for UDP
+ */
+static int udp_setsockopt(struct sock *sk, int level, int optname, 
+			  char __user *optval, int optlen)
+{
+	struct udp_sock *up = udp_sk(sk);
+	int val;
+	int err = 0;
+
+	if (level != SOL_UDP)
+		return ip_setsockopt(sk, level, optname, optval, optlen);
+
+	if(optlen<sizeof(int))
+		return -EINVAL;
+
+	if (get_user(val, (int __user *)optval))
+		return -EFAULT;
+
+	switch(optname) {
+	case UDP_CORK:
+		if (val != 0) {
+			up->corkflag = 1;
+		} else {
+			up->corkflag = 0;
+			lock_sock(sk);
+			udp_push_pending_frames(sk, up);
+			release_sock(sk);
+		}
+		break;
+		
+	case UDP_ENCAP:
+		switch (val) {
+		case 0:
+		case UDP_ENCAP_ESPINUDP:
+		case UDP_ENCAP_ESPINUDP_NON_IKE:
+			up->encap_type = val;
+			break;
+		default:
+			err = -ENOPROTOOPT;
+			break;
+		}
+		break;
+
+	default:
+		err = -ENOPROTOOPT;
+		break;
+	};
+
+	return err;
+}
+
+static int udp_getsockopt(struct sock *sk, int level, int optname, 
+			  char __user *optval, int __user *optlen)
+{
+	struct udp_sock *up = udp_sk(sk);
+	int val, len;
+
+	if (level != SOL_UDP)
+		return ip_getsockopt(sk, level, optname, optval, optlen);
+
+	if(get_user(len,optlen))
+		return -EFAULT;
+
+	len = min_t(unsigned int, len, sizeof(int));
+	
+	if(len < 0)
+		return -EINVAL;
+
+	switch(optname) {
+	case UDP_CORK:
+		val = up->corkflag;
+		break;
+
+	case UDP_ENCAP:
+		val = up->encap_type;
+		break;
+
+	default:
+		return -ENOPROTOOPT;
+	};
+
+  	if(put_user(len, optlen))
+  		return -EFAULT;
+	if(copy_to_user(optval, &val,len))
+		return -EFAULT;
+  	return 0;
+}
+
+/**
+ * 	udp_poll - wait for a UDP event.
+ *	@file - file struct
+ *	@sock - socket
+ *	@wait - poll table
+ *
+ *	This is same as datagram poll, except for the special case of 
+ *	blocking sockets. If application is using a blocking fd
+ *	and a packet with checksum error is in the queue;
+ *	then it could get return from select indicating data available
+ *	but then block when reading it. Add special case code
+ *	to work around these arguably broken applications.
+ */
+unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
+{
+	unsigned int mask = datagram_poll(file, sock, wait);
+	struct sock *sk = sock->sk;
+	
+	/* Check for false positives due to checksum errors */
+	if ( (mask & POLLRDNORM) &&
+	     !(file->f_flags & O_NONBLOCK) &&
+	     !(sk->sk_shutdown & RCV_SHUTDOWN)){
+		struct sk_buff_head *rcvq = &sk->sk_receive_queue;
+		struct sk_buff *skb;
+
+		spin_lock_irq(&rcvq->lock);
+		while ((skb = skb_peek(rcvq)) != NULL) {
+			if (udp_checksum_complete(skb)) {
+				UDP_INC_STATS_BH(UDP_MIB_INERRORS);
+				__skb_unlink(skb, rcvq);
+				kfree_skb(skb);
+			} else {
+				skb->ip_summed = CHECKSUM_UNNECESSARY;
+				break;
+			}
+		}
+		spin_unlock_irq(&rcvq->lock);
+
+		/* nothing to see, move along */
+		if (skb == NULL)
+			mask &= ~(POLLIN | POLLRDNORM);
+	}
+
+	return mask;
+	
+}
+
+struct proto udp_prot = {
+ 	.name =		"UDP",
+	.owner =	THIS_MODULE,
+	.close =	udp_close,
+	.connect =	ip4_datagram_connect,
+	.disconnect =	udp_disconnect,
+	.ioctl =	udp_ioctl,
+	.destroy =	udp_destroy_sock,
+	.setsockopt =	udp_setsockopt,
+	.getsockopt =	udp_getsockopt,
+	.sendmsg =	udp_sendmsg,
+	.recvmsg =	udp_recvmsg,
+	.sendpage =	udp_sendpage,
+	.backlog_rcv =	udp_queue_rcv_skb,
+	.hash =		udp_v4_hash,
+	.unhash =	udp_v4_unhash,
+	.get_port =	udp_v4_get_port,
+	.obj_size =	sizeof(struct udp_sock),
+};
+
+/* ------------------------------------------------------------------------ */
+#ifdef CONFIG_PROC_FS
+
+static struct sock *udp_get_first(struct seq_file *seq)
+{
+	struct sock *sk;
+	struct udp_iter_state *state = seq->private;
+
+	for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+		struct hlist_node *node;
+		sk_for_each(sk, node, &udp_hash[state->bucket]) {
+			if (sk->sk_family == state->family)
+				goto found;
+		}
+	}
+	sk = NULL;
+found:
+	return sk;
+}
+
+static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
+{
+	struct udp_iter_state *state = seq->private;
+
+	do {
+		sk = sk_next(sk);
+try_again:
+		;
+	} while (sk && sk->sk_family != state->family);
+
+	if (!sk && ++state->bucket < UDP_HTABLE_SIZE) {
+		sk = sk_head(&udp_hash[state->bucket]);
+		goto try_again;
+	}
+	return sk;
+}
+
+static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
+{
+	struct sock *sk = udp_get_first(seq);
+
+	if (sk)
+		while(pos && (sk = udp_get_next(seq, sk)) != NULL)
+			--pos;
+	return pos ? NULL : sk;
+}
+
+static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock(&udp_hash_lock);
+	return *pos ? udp_get_idx(seq, *pos-1) : (void *)1;
+}
+
+static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct sock *sk;
+
+	if (v == (void *)1)
+		sk = udp_get_idx(seq, 0);
+	else
+		sk = udp_get_next(seq, v);
+
+	++*pos;
+	return sk;
+}
+
+static void udp_seq_stop(struct seq_file *seq, void *v)
+{
+	read_unlock(&udp_hash_lock);
+}
+
+static int udp_seq_open(struct inode *inode, struct file *file)
+{
+	struct udp_seq_afinfo *afinfo = PDE(inode)->data;
+	struct seq_file *seq;
+	int rc = -ENOMEM;
+	struct udp_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (!s)
+		goto out;
+	memset(s, 0, sizeof(*s));
+	s->family		= afinfo->family;
+	s->seq_ops.start	= udp_seq_start;
+	s->seq_ops.next		= udp_seq_next;
+	s->seq_ops.show		= afinfo->seq_show;
+	s->seq_ops.stop		= udp_seq_stop;
+
+	rc = seq_open(file, &s->seq_ops);
+	if (rc)
+		goto out_kfree;
+
+	seq	     = file->private_data;
+	seq->private = s;
+out:
+	return rc;
+out_kfree:
+	kfree(s);
+	goto out;
+}
+
+/* ------------------------------------------------------------------------ */
+int udp_proc_register(struct udp_seq_afinfo *afinfo)
+{
+	struct proc_dir_entry *p;
+	int rc = 0;
+
+	if (!afinfo)
+		return -EINVAL;
+	afinfo->seq_fops->owner		= afinfo->owner;
+	afinfo->seq_fops->open		= udp_seq_open;
+	afinfo->seq_fops->read		= seq_read;
+	afinfo->seq_fops->llseek	= seq_lseek;
+	afinfo->seq_fops->release	= seq_release_private;
+
+	p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
+	if (p)
+		p->data = afinfo;
+	else
+		rc = -ENOMEM;
+	return rc;
+}
+
+void udp_proc_unregister(struct udp_seq_afinfo *afinfo)
+{
+	if (!afinfo)
+		return;
+	proc_net_remove(afinfo->name);
+	memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
+}
+
+/* ------------------------------------------------------------------------ */
+static void udp4_format_sock(struct sock *sp, char *tmpbuf, int bucket)
+{
+	struct inet_sock *inet = inet_sk(sp);
+	unsigned int dest = inet->daddr;
+	unsigned int src  = inet->rcv_saddr;
+	__u16 destp	  = ntohs(inet->dport);
+	__u16 srcp	  = ntohs(inet->sport);
+
+	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
+		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p",
+		bucket, src, srcp, dest, destp, sp->sk_state, 
+		atomic_read(&sp->sk_wmem_alloc),
+		atomic_read(&sp->sk_rmem_alloc),
+		0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp),
+		atomic_read(&sp->sk_refcnt), sp);
+}
+
+static int udp4_seq_show(struct seq_file *seq, void *v)
+{
+	if (v == SEQ_START_TOKEN)
+		seq_printf(seq, "%-127s\n",
+			   "  sl  local_address rem_address   st tx_queue "
+			   "rx_queue tr tm->when retrnsmt   uid  timeout "
+			   "inode");
+	else {
+		char tmpbuf[129];
+		struct udp_iter_state *state = seq->private;
+
+		udp4_format_sock(v, tmpbuf, state->bucket);
+		seq_printf(seq, "%-127s\n", tmpbuf);
+	}
+	return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+static struct file_operations udp4_seq_fops;
+static struct udp_seq_afinfo udp4_seq_afinfo = {
+	.owner		= THIS_MODULE,
+	.name		= "udp",
+	.family		= AF_INET,
+	.seq_show	= udp4_seq_show,
+	.seq_fops	= &udp4_seq_fops,
+};
+
+int __init udp4_proc_init(void)
+{
+	return udp_proc_register(&udp4_seq_afinfo);
+}
+
+void udp4_proc_exit(void)
+{
+	udp_proc_unregister(&udp4_seq_afinfo);
+}
+#endif /* CONFIG_PROC_FS */
+
+EXPORT_SYMBOL(udp_disconnect);
+EXPORT_SYMBOL(udp_hash);
+EXPORT_SYMBOL(udp_hash_lock);
+EXPORT_SYMBOL(udp_ioctl);
+EXPORT_SYMBOL(udp_port_rover);
+EXPORT_SYMBOL(udp_prot);
+EXPORT_SYMBOL(udp_sendmsg);
+EXPORT_SYMBOL(udp_poll);
+
+#ifdef CONFIG_PROC_FS
+EXPORT_SYMBOL(udp_proc_register);
+EXPORT_SYMBOL(udp_proc_unregister);
+#endif
diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c
new file mode 100644
index 000000000000..6aecd7a43534
--- /dev/null
+++ b/net/ipv4/utils.c
@@ -0,0 +1,59 @@
+/*
+ * INET		An implementation of the TCP/IP protocol suite for the LINUX
+ *		operating system.  INET is implemented using the  BSD Socket
+ *		interface as the means of communication with the user level.
+ *
+ *		Various kernel-resident INET utility functions; mainly
+ *		for format conversion and debugging output.
+ *
+ * Version:	$Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $
+ *
+ * Author:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
+ *
+ * Fixes:
+ *		Alan Cox	:	verify_area check.
+ *		Alan Cox	:	removed old debugging.
+ *		Andi Kleen	:	add net_ratelimit()  
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+/*
+ *	Convert an ASCII string to binary IP. 
+ */
+ 
+__u32 in_aton(const char *str)
+{
+	unsigned long l;
+	unsigned int val;
+	int i;
+
+	l = 0;
+	for (i = 0; i < 4; i++) 
+	{
+		l <<= 8;
+		if (*str != '\0') 
+		{
+			val = 0;
+			while (*str != '\0' && *str != '.') 
+			{
+				val *= 10;
+				val += *str - '0';
+				str++;
+			}
+			l |= val;
+			if (*str != '\0') 
+				str++;
+		}
+	}
+	return(htonl(l));
+}
+
+EXPORT_SYMBOL(in_aton);
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
new file mode 100644
index 000000000000..2d3849c38a0f
--- /dev/null
+++ b/net/ipv4/xfrm4_input.c
@@ -0,0 +1,160 @@
+/*
+ * xfrm4_input.c
+ *
+ * Changes:
+ *	YOSHIFUJI Hideaki @USAGI
+ *		Split up af-specific portion
+ *	Derek Atkins <derek@ihtfp.com>
+ *		Add Encapsulation support
+ * 	
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+
+int xfrm4_rcv(struct sk_buff *skb)
+{
+	return xfrm4_rcv_encap(skb, 0);
+}
+
+EXPORT_SYMBOL(xfrm4_rcv);
+
+static inline void ipip_ecn_decapsulate(struct sk_buff *skb)
+{
+	struct iphdr *outer_iph = skb->nh.iph;
+	struct iphdr *inner_iph = skb->h.ipiph;
+
+	if (INET_ECN_is_ce(outer_iph->tos))
+		IP_ECN_set_ce(inner_iph);
+}
+
+static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq)
+{
+	switch (nexthdr) {
+	case IPPROTO_IPIP:
+		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+			return -EINVAL;
+		*spi = skb->nh.iph->saddr;
+		*seq = 0;
+		return 0;
+	}
+
+	return xfrm_parse_spi(skb, nexthdr, spi, seq);
+}
+
+int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
+{
+	int err;
+	u32 spi, seq;
+	struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH];
+	struct xfrm_state *x;
+	int xfrm_nr = 0;
+	int decaps = 0;
+
+	if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0)
+		goto drop;
+
+	do {
+		struct iphdr *iph = skb->nh.iph;
+
+		if (xfrm_nr == XFRM_MAX_DEPTH)
+			goto drop;
+
+		x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET);
+		if (x == NULL)
+			goto drop;
+
+		spin_lock(&x->lock);
+		if (unlikely(x->km.state != XFRM_STATE_VALID))
+			goto drop_unlock;
+
+		if (x->props.replay_window && xfrm_replay_check(x, seq))
+			goto drop_unlock;
+
+		if (xfrm_state_check_expire(x))
+			goto drop_unlock;
+
+		xfrm_vec[xfrm_nr].decap.decap_type = encap_type;
+		if (x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb))
+			goto drop_unlock;
+
+		/* only the first xfrm gets the encap type */
+		encap_type = 0;
+
+		if (x->props.replay_window)
+			xfrm_replay_advance(x, seq);
+
+		x->curlft.bytes += skb->len;
+		x->curlft.packets++;
+
+		spin_unlock(&x->lock);
+
+		xfrm_vec[xfrm_nr++].xvec = x;
+
+		iph = skb->nh.iph;
+
+		if (x->props.mode) {
+			if (iph->protocol != IPPROTO_IPIP)
+				goto drop;
+			if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+				goto drop;
+			if (skb_cloned(skb) &&
+			    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+				goto drop;
+			if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+				ipv4_copy_dscp(iph, skb->h.ipiph);
+			if (!(x->props.flags & XFRM_STATE_NOECN))
+				ipip_ecn_decapsulate(skb);
+			skb->mac.raw = memmove(skb->data - skb->mac_len,
+					       skb->mac.raw, skb->mac_len);
+			skb->nh.raw = skb->data;
+			memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+			decaps = 1;
+			break;
+		}
+
+		if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0)
+			goto drop;
+	} while (!err);
+
+	/* Allocate new secpath or COW existing one. */
+
+	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+		struct sec_path *sp;
+		sp = secpath_dup(skb->sp);
+		if (!sp)
+			goto drop;
+		if (skb->sp)
+			secpath_put(skb->sp);
+		skb->sp = sp;
+	}
+	if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
+		goto drop;
+
+	memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state));
+	skb->sp->len += xfrm_nr;
+
+	if (decaps) {
+		if (!(skb->dev->flags&IFF_LOOPBACK)) {
+			dst_release(skb->dst);
+			skb->dst = NULL;
+		}
+		netif_rx(skb);
+		return 0;
+	} else {
+		return -skb->nh.iph->protocol;
+	}
+
+drop_unlock:
+	spin_unlock(&x->lock);
+	xfrm_state_put(x);
+drop:
+	while (--xfrm_nr >= 0)
+		xfrm_state_put(xfrm_vec[xfrm_nr].xvec);
+
+	kfree_skb(skb);
+	return 0;
+}
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
new file mode 100644
index 000000000000..af2392ae5769
--- /dev/null
+++ b/net/ipv4/xfrm4_output.c
@@ -0,0 +1,141 @@
+/*
+ * xfrm4_output.c - Common IPsec encapsulation code for IPv4.
+ * Copyright (c) 2004 Herbert Xu <herbert@gondor.apana.org.au>
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/icmp.h>
+
+/* Add encapsulation header.
+ *
+ * In transport mode, the IP header will be moved forward to make space
+ * for the encapsulation header.
+ *
+ * In tunnel mode, the top IP header will be constructed per RFC 2401.
+ * The following fields in it shall be filled in by x->type->output:
+ *	tot_len
+ *	check
+ *
+ * On exit, skb->h will be set to the start of the payload to be processed
+ * by x->type->output and skb->nh will be set to the top IP header.
+ */
+static void xfrm4_encap(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	struct iphdr *iph, *top_iph;
+
+	iph = skb->nh.iph;
+	skb->h.ipiph = iph;
+
+	skb->nh.raw = skb_push(skb, x->props.header_len);
+	top_iph = skb->nh.iph;
+
+	if (!x->props.mode) {
+		skb->h.raw += iph->ihl*4;
+		memmove(top_iph, iph, iph->ihl*4);
+		return;
+	}
+
+	top_iph->ihl = 5;
+	top_iph->version = 4;
+
+	/* DS disclosed */
+	top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos);
+	if (x->props.flags & XFRM_STATE_NOECN)
+		IP_ECN_clear(top_iph);
+
+	top_iph->frag_off = iph->frag_off & htons(IP_DF);
+	if (!top_iph->frag_off)
+		__ip_select_ident(top_iph, dst, 0);
+
+	top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT);
+
+	top_iph->saddr = x->props.saddr.a4;
+	top_iph->daddr = x->id.daddr.a4;
+	top_iph->protocol = IPPROTO_IPIP;
+
+	memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+}
+
+static int xfrm4_tunnel_check_size(struct sk_buff *skb)
+{
+	int mtu, ret = 0;
+	struct dst_entry *dst;
+	struct iphdr *iph = skb->nh.iph;
+
+	if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE)
+		goto out;
+
+	IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+	
+	if (!(iph->frag_off & htons(IP_DF)) || skb->local_df)
+		goto out;
+
+	dst = skb->dst;
+	mtu = dst_mtu(dst);
+	if (skb->len > mtu) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+		ret = -EMSGSIZE;
+	}
+out:
+	return ret;
+}
+
+int xfrm4_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb->dst;
+	struct xfrm_state *x = dst->xfrm;
+	int err;
+	
+	if (skb->ip_summed == CHECKSUM_HW) {
+		err = skb_checksum_help(skb, 0);
+		if (err)
+			goto error_nolock;
+	}
+
+	if (x->props.mode) {
+		err = xfrm4_tunnel_check_size(skb);
+		if (err)
+			goto error_nolock;
+	}
+
+	spin_lock_bh(&x->lock);
+	err = xfrm_state_check(x, skb);
+	if (err)
+		goto error;
+
+	xfrm4_encap(skb);
+
+	err = x->type->output(x, skb);
+	if (err)
+		goto error;
+
+	x->curlft.bytes += skb->len;
+	x->curlft.packets++;
+
+	spin_unlock_bh(&x->lock);
+	
+	if (!(skb->dst = dst_pop(dst))) {
+		err = -EHOSTUNREACH;
+		goto error_nolock;
+	}
+	err = NET_XMIT_BYPASS;
+
+out_exit:
+	return err;
+error:
+	spin_unlock_bh(&x->lock);
+error_nolock:
+	kfree_skb(skb);
+	goto out_exit;
+}
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
new file mode 100644
index 000000000000..7fe2afd2e669
--- /dev/null
+++ b/net/ipv4/xfrm4_policy.c
@@ -0,0 +1,281 @@
+/* 
+ * xfrm4_policy.c
+ *
+ * Changes:
+ *	Kazunori MIYAZAWA @USAGI
+ * 	YOSHIFUJI Hideaki @USAGI
+ *		Split up af-specific portion
+ * 	
+ */
+
+#include <linux/config.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+
+static struct dst_ops xfrm4_dst_ops;
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
+
+static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
+
+static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
+{
+	return __ip_route_output_key((struct rtable**)dst, fl);
+}
+
+static struct dst_entry *
+__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy)
+{
+	struct dst_entry *dst;
+
+	read_lock_bh(&policy->lock);
+	for (dst = policy->bundles; dst; dst = dst->next) {
+		struct xfrm_dst *xdst = (struct xfrm_dst*)dst;
+		if (xdst->u.rt.fl.oif == fl->oif &&	/*XXX*/
+		    xdst->u.rt.fl.fl4_dst == fl->fl4_dst &&
+	    	    xdst->u.rt.fl.fl4_src == fl->fl4_src &&
+		    xfrm_bundle_ok(xdst, fl, AF_INET)) {
+			dst_clone(dst);
+			break;
+		}
+	}
+	read_unlock_bh(&policy->lock);
+	return dst;
+}
+
+/* Allocate chain of dst_entry's, attach known xfrm's, calculate
+ * all the metrics... Shortly, bundle a bundle.
+ */
+
+static int
+__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx,
+		      struct flowi *fl, struct dst_entry **dst_p)
+{
+	struct dst_entry *dst, *dst_prev;
+	struct rtable *rt0 = (struct rtable*)(*dst_p);
+	struct rtable *rt = rt0;
+	u32 remote = fl->fl4_dst;
+	u32 local  = fl->fl4_src;
+	struct flowi fl_tunnel = {
+		.nl_u = {
+			.ip4_u = {
+				.saddr = local,
+				.daddr = remote
+			}
+		}
+	};
+	int i;
+	int err;
+	int header_len = 0;
+	int trailer_len = 0;
+
+	dst = dst_prev = NULL;
+	dst_hold(&rt->u.dst);
+
+	for (i = 0; i < nx; i++) {
+		struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops);
+		struct xfrm_dst *xdst;
+		int tunnel = 0;
+
+		if (unlikely(dst1 == NULL)) {
+			err = -ENOBUFS;
+			dst_release(&rt->u.dst);
+			goto error;
+		}
+
+		if (!dst)
+			dst = dst1;
+		else {
+			dst_prev->child = dst1;
+			dst1->flags |= DST_NOHASH;
+			dst_clone(dst1);
+		}
+
+		xdst = (struct xfrm_dst *)dst1;
+		xdst->route = &rt->u.dst;
+
+		dst1->next = dst_prev;
+		dst_prev = dst1;
+		if (xfrm[i]->props.mode) {
+			remote = xfrm[i]->id.daddr.a4;
+			local  = xfrm[i]->props.saddr.a4;
+			tunnel = 1;
+		}
+		header_len += xfrm[i]->props.header_len;
+		trailer_len += xfrm[i]->props.trailer_len;
+
+		if (tunnel) {
+			fl_tunnel.fl4_src = local;
+			fl_tunnel.fl4_dst = remote;
+			err = xfrm_dst_lookup((struct xfrm_dst **)&rt,
+					      &fl_tunnel, AF_INET);
+			if (err)
+				goto error;
+		} else
+			dst_hold(&rt->u.dst);
+	}
+
+	dst_prev->child = &rt->u.dst;
+	dst->path = &rt->u.dst;
+
+	*dst_p = dst;
+	dst = dst_prev;
+
+	dst_prev = *dst_p;
+	i = 0;
+	for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) {
+		struct xfrm_dst *x = (struct xfrm_dst*)dst_prev;
+		x->u.rt.fl = *fl;
+
+		dst_prev->xfrm = xfrm[i++];
+		dst_prev->dev = rt->u.dst.dev;
+		if (rt->u.dst.dev)
+			dev_hold(rt->u.dst.dev);
+		dst_prev->obsolete	= -1;
+		dst_prev->flags	       |= DST_HOST;
+		dst_prev->lastuse	= jiffies;
+		dst_prev->header_len	= header_len;
+		dst_prev->trailer_len	= trailer_len;
+		memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics));
+
+		/* Copy neighbout for reachability confirmation */
+		dst_prev->neighbour	= neigh_clone(rt->u.dst.neighbour);
+		dst_prev->input		= rt->u.dst.input;
+		dst_prev->output	= xfrm4_output;
+		if (rt->peer)
+			atomic_inc(&rt->peer->refcnt);
+		x->u.rt.peer = rt->peer;
+		/* Sheit... I remember I did this right. Apparently,
+		 * it was magically lost, so this code needs audit */
+		x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL);
+		x->u.rt.rt_type = rt->rt_type;
+		x->u.rt.rt_src = rt0->rt_src;
+		x->u.rt.rt_dst = rt0->rt_dst;
+		x->u.rt.rt_gateway = rt->rt_gateway;
+		x->u.rt.rt_spec_dst = rt0->rt_spec_dst;
+		header_len -= x->u.dst.xfrm->props.header_len;
+		trailer_len -= x->u.dst.xfrm->props.trailer_len;
+	}
+
+	xfrm_init_pmtu(dst);
+	return 0;
+
+error:
+	if (dst)
+		dst_free(dst);
+	return err;
+}
+
+static void
+_decode_session4(struct sk_buff *skb, struct flowi *fl)
+{
+	struct iphdr *iph = skb->nh.iph;
+	u8 *xprth = skb->nh.raw + iph->ihl*4;
+
+	memset(fl, 0, sizeof(struct flowi));
+	if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) {
+		switch (iph->protocol) {
+		case IPPROTO_UDP:
+		case IPPROTO_TCP:
+		case IPPROTO_SCTP:
+			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+				u16 *ports = (u16 *)xprth;
+
+				fl->fl_ip_sport = ports[0];
+				fl->fl_ip_dport = ports[1];
+			}
+			break;
+
+		case IPPROTO_ICMP:
+			if (pskb_may_pull(skb, xprth + 2 - skb->data)) {
+				u8 *icmp = xprth;
+
+				fl->fl_icmp_type = icmp[0];
+				fl->fl_icmp_code = icmp[1];
+			}
+			break;
+
+		case IPPROTO_ESP:
+			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+				u32 *ehdr = (u32 *)xprth;
+
+				fl->fl_ipsec_spi = ehdr[0];
+			}
+			break;
+
+		case IPPROTO_AH:
+			if (pskb_may_pull(skb, xprth + 8 - skb->data)) {
+				u32 *ah_hdr = (u32*)xprth;
+
+				fl->fl_ipsec_spi = ah_hdr[1];
+			}
+			break;
+
+		case IPPROTO_COMP:
+			if (pskb_may_pull(skb, xprth + 4 - skb->data)) {
+				u16 *ipcomp_hdr = (u16 *)xprth;
+
+				fl->fl_ipsec_spi = ntohl(ntohs(ipcomp_hdr[1]));
+			}
+			break;
+		default:
+			fl->fl_ipsec_spi = 0;
+			break;
+		};
+	}
+	fl->proto = iph->protocol;
+	fl->fl4_dst = iph->daddr;
+	fl->fl4_src = iph->saddr;
+}
+
+static inline int xfrm4_garbage_collect(void)
+{
+	read_lock(&xfrm4_policy_afinfo.lock);
+	xfrm4_policy_afinfo.garbage_collect();
+	read_unlock(&xfrm4_policy_afinfo.lock);
+	return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
+}
+
+static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
+{
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+	struct dst_entry *path = xdst->route;
+
+	path->ops->update_pmtu(path, mtu);
+}
+
+static struct dst_ops xfrm4_dst_ops = {
+	.family =		AF_INET,
+	.protocol =		__constant_htons(ETH_P_IP),
+	.gc =			xfrm4_garbage_collect,
+	.update_pmtu =		xfrm4_update_pmtu,
+	.gc_thresh =		1024,
+	.entry_size =		sizeof(struct xfrm_dst),
+};
+
+static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
+	.family = 		AF_INET,
+	.lock = 		RW_LOCK_UNLOCKED,
+	.type_map = 		&xfrm4_type_map,
+	.dst_ops =		&xfrm4_dst_ops,
+	.dst_lookup =		xfrm4_dst_lookup,
+	.find_bundle = 		__xfrm4_find_bundle,
+	.bundle_create =	__xfrm4_bundle_create,
+	.decode_session =	_decode_session4,
+};
+
+static void __init xfrm4_policy_init(void)
+{
+	xfrm_policy_register_afinfo(&xfrm4_policy_afinfo);
+}
+
+static void __exit xfrm4_policy_fini(void)
+{
+	xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo);
+}
+
+void __init xfrm4_init(void)
+{
+	xfrm4_state_init();
+	xfrm4_policy_init();
+}
+
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c
new file mode 100644
index 000000000000..223a2e83853f
--- /dev/null
+++ b/net/ipv4/xfrm4_state.c
@@ -0,0 +1,126 @@
+/*
+ * xfrm4_state.c
+ *
+ * Changes:
+ * 	YOSHIFUJI Hideaki @USAGI
+ * 		Split up af-specific portion
+ *
+ */
+
+#include <net/xfrm.h>
+#include <linux/pfkeyv2.h>
+#include <linux/ipsec.h>
+
+static struct xfrm_state_afinfo xfrm4_state_afinfo;
+
+static void
+__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl,
+		     struct xfrm_tmpl *tmpl,
+		     xfrm_address_t *daddr, xfrm_address_t *saddr)
+{
+	x->sel.daddr.a4 = fl->fl4_dst;
+	x->sel.saddr.a4 = fl->fl4_src;
+	x->sel.dport = xfrm_flowi_dport(fl);
+	x->sel.dport_mask = ~0;
+	x->sel.sport = xfrm_flowi_sport(fl);
+	x->sel.sport_mask = ~0;
+	x->sel.prefixlen_d = 32;
+	x->sel.prefixlen_s = 32;
+	x->sel.proto = fl->proto;
+	x->sel.ifindex = fl->oif;
+	x->id = tmpl->id;
+	if (x->id.daddr.a4 == 0)
+		x->id.daddr.a4 = daddr->a4;
+	x->props.saddr = tmpl->saddr;
+	if (x->props.saddr.a4 == 0)
+		x->props.saddr.a4 = saddr->a4;
+	x->props.mode = tmpl->mode;
+	x->props.reqid = tmpl->reqid;
+	x->props.family = AF_INET;
+}
+
+static struct xfrm_state *
+__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto)
+{
+	unsigned h = __xfrm4_spi_hash(daddr, spi, proto);
+	struct xfrm_state *x;
+
+	list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) {
+		if (x->props.family == AF_INET &&
+		    spi == x->id.spi &&
+		    daddr->a4 == x->id.daddr.a4 &&
+		    proto == x->id.proto) {
+			xfrm_state_hold(x);
+			return x;
+		}
+	}
+	return NULL;
+}
+
+static struct xfrm_state *
+__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, 
+		 xfrm_address_t *daddr, xfrm_address_t *saddr, 
+		 int create)
+{
+	struct xfrm_state *x, *x0;
+	unsigned h = __xfrm4_dst_hash(daddr);
+
+	x0 = NULL;
+
+	list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) {
+		if (x->props.family == AF_INET &&
+		    daddr->a4 == x->id.daddr.a4 &&
+		    mode == x->props.mode &&
+		    proto == x->id.proto &&
+		    saddr->a4 == x->props.saddr.a4 &&
+		    reqid == x->props.reqid &&
+		    x->km.state == XFRM_STATE_ACQ &&
+		    !x->id.spi) {
+			    x0 = x;
+			    break;
+		    }
+	}
+	if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) {
+		x0->sel.daddr.a4 = daddr->a4;
+		x0->sel.saddr.a4 = saddr->a4;
+		x0->sel.prefixlen_d = 32;
+		x0->sel.prefixlen_s = 32;
+		x0->props.saddr.a4 = saddr->a4;
+		x0->km.state = XFRM_STATE_ACQ;
+		x0->id.daddr.a4 = daddr->a4;
+		x0->id.proto = proto;
+		x0->props.family = AF_INET;
+		x0->props.mode = mode;
+		x0->props.reqid = reqid;
+		x0->props.family = AF_INET;
+		x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
+		xfrm_state_hold(x0);
+		x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
+		add_timer(&x0->timer);
+		xfrm_state_hold(x0);
+		list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h);
+		wake_up(&km_waitq);
+	}
+	if (x0)
+		xfrm_state_hold(x0);
+	return x0;
+}
+
+static struct xfrm_state_afinfo xfrm4_state_afinfo = {
+	.family			= AF_INET,
+	.lock			= RW_LOCK_UNLOCKED,
+	.init_tempsel		= __xfrm4_init_tempsel,
+	.state_lookup		= __xfrm4_state_lookup,
+	.find_acq		= __xfrm4_find_acq,
+};
+
+void __init xfrm4_state_init(void)
+{
+	xfrm_state_register_afinfo(&xfrm4_state_afinfo);
+}
+
+void __exit xfrm4_state_fini(void)
+{
+	xfrm_state_unregister_afinfo(&xfrm4_state_afinfo);
+}
+
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
new file mode 100644
index 000000000000..413191f585f6
--- /dev/null
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -0,0 +1,144 @@
+/* xfrm4_tunnel.c: Generic IP tunnel transformer.
+ *
+ * Copyright (C) 2003 David S. Miller (davem@redhat.com)
+ */
+
+#include <linux/skbuff.h>
+#include <linux/module.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+
+static int ipip_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct iphdr *iph;
+	
+	iph = skb->nh.iph;
+	iph->tot_len = htons(skb->len);
+	ip_send_check(iph);
+
+	return 0;
+}
+
+static int ipip_xfrm_rcv(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb)
+{
+	return 0;
+}
+
+static struct xfrm_tunnel *ipip_handler;
+static DECLARE_MUTEX(xfrm4_tunnel_sem);
+
+int xfrm4_tunnel_register(struct xfrm_tunnel *handler)
+{
+	int ret;
+
+	down(&xfrm4_tunnel_sem);
+	ret = 0;
+	if (ipip_handler != NULL)
+		ret = -EINVAL;
+	if (!ret)
+		ipip_handler = handler;
+	up(&xfrm4_tunnel_sem);
+
+	return ret;
+}
+
+EXPORT_SYMBOL(xfrm4_tunnel_register);
+
+int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler)
+{
+	int ret;
+
+	down(&xfrm4_tunnel_sem);
+	ret = 0;
+	if (ipip_handler != handler)
+		ret = -EINVAL;
+	if (!ret)
+		ipip_handler = NULL;
+	up(&xfrm4_tunnel_sem);
+
+	synchronize_net();
+
+	return ret;
+}
+
+EXPORT_SYMBOL(xfrm4_tunnel_deregister);
+
+static int ipip_rcv(struct sk_buff *skb)
+{
+	struct xfrm_tunnel *handler = ipip_handler;
+
+	/* Tunnel devices take precedence.  */
+	if (handler && handler->handler(skb) == 0)
+		return 0;
+
+	return xfrm4_rcv(skb);
+}
+
+static void ipip_err(struct sk_buff *skb, u32 info)
+{
+	struct xfrm_tunnel *handler = ipip_handler;
+	u32 arg = info;
+
+	if (handler)
+		handler->err_handler(skb, &arg);
+}
+
+static int ipip_init_state(struct xfrm_state *x, void *args)
+{
+	if (!x->props.mode)
+		return -EINVAL;
+
+	if (x->encap)
+		return -EINVAL;
+
+	x->props.header_len = sizeof(struct iphdr);
+
+	return 0;
+}
+
+static void ipip_destroy(struct xfrm_state *x)
+{
+}
+
+static struct xfrm_type ipip_type = {
+	.description	= "IPIP",
+	.owner		= THIS_MODULE,
+	.proto	     	= IPPROTO_IPIP,
+	.init_state	= ipip_init_state,
+	.destructor	= ipip_destroy,
+	.input		= ipip_xfrm_rcv,
+	.output		= ipip_output
+};
+
+static struct net_protocol ipip_protocol = {
+	.handler	=	ipip_rcv,
+	.err_handler	=	ipip_err,
+	.no_policy	=	1,
+};
+
+static int __init ipip_init(void)
+{
+	if (xfrm_register_type(&ipip_type, AF_INET) < 0) {
+		printk(KERN_INFO "ipip init: can't add xfrm type\n");
+		return -EAGAIN;
+	}
+	if (inet_add_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) {
+		printk(KERN_INFO "ipip init: can't add protocol\n");
+		xfrm_unregister_type(&ipip_type, AF_INET);
+		return -EAGAIN;
+	}
+	return 0;
+}
+
+static void __exit ipip_fini(void)
+{
+	if (inet_del_protocol(&ipip_protocol, IPPROTO_IPIP) < 0)
+		printk(KERN_INFO "ipip close: can't remove protocol\n");
+	if (xfrm_unregister_type(&ipip_type, AF_INET) < 0)
+		printk(KERN_INFO "ipip close: can't remove xfrm type\n");
+}
+
+module_init(ipip_init);
+module_exit(ipip_fini);
+MODULE_LICENSE("GPL");
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 15:20:36 -0700
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /net/ipv4
download	linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.gz linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.tar.bz2 linux-1da177e4c3f41524e886b7f1b8a0c1fc7321cac2.zip