From d67b24c40f81823abe5c1eb808bba1038969142b Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Sat, 24 Sep 2005 16:52:03 -0700
Subject: [NETFILTER]: Fix ip[6]t_NFQUEUE Kconfig dependency

We have to introduce a separate Kconfig menu entry for the NFQUEUE targets.
They cannot "just" depend on nfnetlink_queue, since nfnetlink_queue could
be linked into the kernel, whereas iptables can be a module.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/Kconfig  | 11 +++++++++++
 net/ipv4/netfilter/Makefile |  2 +-
 2 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 3cf9b451675c..2cd7e7d1ac90 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -537,6 +537,17 @@ config IP_NF_TARGET_TCPMSS
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_TARGET_NFQUEUE
+	tristate "NFQUEUE Target Support"
+	depends on IP_NF_IPTABLES
+	help
+	  This Target replaced the old obsolete QUEUE target.
+
+	  As opposed to QUEUE, it supports 65535 different queues,
+	  not just one.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 # NAT + specific targets
 config IP_NF_NAT
 	tristate "Full NAT"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 3d45d3c0283c..89002533f2a2 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -92,6 +92,7 @@ obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o
 obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o
 obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o
 obj-$(CONFIG_IP_NF_TARGET_TTL) += ipt_TTL.o
+obj-$(CONFIG_IP_NF_TARGET_NFQUEUE) += ipt_NFQUEUE.o
 
 # generic ARP tables
 obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o
@@ -101,4 +102,3 @@ obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o
 obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o
 
 obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
-obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += ipt_NFQUEUE.o
-- 
cgit v1.2.3


From 8ddec7460d2f5db3ac35812c03676b1473d1d668 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Sat, 24 Sep 2005 16:56:08 -0700
Subject: [NETFILTER] ip_conntrack: Update event cache when status changes

The GRE, SCTP and TCP protocol helpers did not call
ip_conntrack_event_cache() when updating ct->status.  This patch adds
the respective calls.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_proto_gre.c  | 1 +
 net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 1 +
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c  | 3 ++-
 3 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
index de3cb9db6f85..744abb9d377a 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c
@@ -247,6 +247,7 @@ static int gre_packet(struct ip_conntrack *ct,
 				   ct->proto.gre.stream_timeout);
 		/* Also, more likely to be important, and not a probe. */
 		set_bit(IPS_ASSURED_BIT, &ct->status);
+		ip_conntrack_event_cache(IPCT_STATUS, skb);
 	} else
 		ip_ct_refresh_acct(ct, conntrackinfo, skb,
 				   ct->proto.gre.timeout);
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index a875f35e576d..59a4a0111dd3 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -416,6 +416,7 @@ static int sctp_packet(struct ip_conntrack *conntrack,
 		&& newconntrack == SCTP_CONNTRACK_ESTABLISHED) {
 		DEBUGP("Setting assured bit\n");
 		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		ip_conntrack_event_cache(IPCT_STATUS, skb);
 	}
 
 	return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 1985abc59d24..121760d6cc50 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -1014,7 +1014,8 @@ static int tcp_packet(struct ip_conntrack *conntrack,
 		/* Set ASSURED if we see see valid ack in ESTABLISHED 
 		   after SYN_RECV or a valid answer for a picked up 
 		   connection. */
-			set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		set_bit(IPS_ASSURED_BIT, &conntrack->status);
+		ip_conntrack_event_cache(IPCT_STATUS, skb);
 	}
 	ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout);
 
-- 
cgit v1.2.3


From 188bab3ae0ed164bc18f98be932512d777dd038b Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 26 Sep 2005 15:25:11 -0700
Subject: [NETFILTER]: Fix invalid module autoloading by splitting iptable_nat

When you've enabled conntrack and NAT as a module (standard case in all
distributions), and you've also enabled the new conntrack netlink
interface, loading ip_conntrack_netlink.ko will auto-load iptable_nat.ko.
This causes a huge performance penalty, since for every packet you iterate
the nat code, even if you don't want it.

This patch splits iptable_nat.ko into the NAT core (ip_nat.ko) and the
iptables frontend (iptable_nat.ko).  Therefore, ip_conntrack_netlink.ko will
only pull ip_nat.ko, but not the frontend.  ip_nat.ko will "only" allocate
some resources, but not affect runtime performance.

This separation is also a nice step in anticipation of new packet filters
(nf-hipac, ipset, pkttables) being able to use the NAT core.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/Makefile            |  5 +++--
 net/ipv4/netfilter/ip_nat_core.c       | 35 +++++++++++++++++++++++-----------
 net/ipv4/netfilter/ip_nat_helper.c     |  4 ++++
 net/ipv4/netfilter/ip_nat_standalone.c | 25 ++++--------------------
 4 files changed, 35 insertions(+), 34 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 89002533f2a2..dab4b58dd31e 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -4,7 +4,8 @@
 
 # objects for the standalone - connection tracking / NAT
 ip_conntrack-objs	:= ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
-iptable_nat-objs	:= ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+ip_nat-objs	:= ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+iptable_nat-objs	:= ip_nat_rule.o ip_nat_standalone.o
 
 ip_conntrack_pptp-objs	:= ip_conntrack_helper_pptp.o ip_conntrack_proto_gre.o
 ip_nat_pptp-objs	:= ip_nat_helper_pptp.o ip_nat_proto_gre.o
@@ -40,7 +41,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
 # the three instances of ip_tables
 obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
 obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
-obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o ip_nat.o
 obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
 
 # matches
diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c
index c3ea891d38e7..c5e3abd24672 100644
--- a/net/ipv4/netfilter/ip_nat_core.c
+++ b/net/ipv4/netfilter/ip_nat_core.c
@@ -74,12 +74,14 @@ ip_nat_proto_find_get(u_int8_t protonum)
 
 	return p;
 }
+EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
 
 void
 ip_nat_proto_put(struct ip_nat_protocol *p)
 {
 	module_put(p->me);
 }
+EXPORT_SYMBOL_GPL(ip_nat_proto_put);
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
@@ -111,6 +113,7 @@ ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck)
 	return csum_fold(csum_partial((char *)diffs, sizeof(diffs),
 				      oldcheck^0xFFFF));
 }
+EXPORT_SYMBOL(ip_nat_cheat_check);
 
 /* Is this tuple already taken? (not by us) */
 int
@@ -127,6 +130,7 @@ ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple,
 	invert_tuplepr(&reply, tuple);
 	return ip_conntrack_tuple_taken(&reply, ignored_conntrack);
 }
+EXPORT_SYMBOL(ip_nat_used_tuple);
 
 /* If we source map this tuple so reply looks like reply_tuple, will
  * that meet the constraints of range. */
@@ -347,6 +351,7 @@ ip_nat_setup_info(struct ip_conntrack *conntrack,
 
 	return NF_ACCEPT;
 }
+EXPORT_SYMBOL(ip_nat_setup_info);
 
 /* Returns true if succeeded. */
 static int
@@ -387,10 +392,10 @@ manip_pkt(u_int16_t proto,
 }
 
 /* Do packet manipulations according to ip_nat_setup_info. */
-unsigned int nat_packet(struct ip_conntrack *ct,
-			enum ip_conntrack_info ctinfo,
-			unsigned int hooknum,
-			struct sk_buff **pskb)
+unsigned int ip_nat_packet(struct ip_conntrack *ct,
+			   enum ip_conntrack_info ctinfo,
+			   unsigned int hooknum,
+			   struct sk_buff **pskb)
 {
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 	unsigned long statusbit;
@@ -417,12 +422,13 @@ unsigned int nat_packet(struct ip_conntrack *ct,
 	}
 	return NF_ACCEPT;
 }
+EXPORT_SYMBOL_GPL(ip_nat_packet);
 
 /* Dir is direction ICMP is coming from (opposite to packet it contains) */
-int icmp_reply_translation(struct sk_buff **pskb,
-			   struct ip_conntrack *ct,
-			   enum ip_nat_manip_type manip,
-			   enum ip_conntrack_dir dir)
+int ip_nat_icmp_reply_translation(struct sk_buff **pskb,
+				  struct ip_conntrack *ct,
+				  enum ip_nat_manip_type manip,
+				  enum ip_conntrack_dir dir)
 {
 	struct {
 		struct icmphdr icmp;
@@ -509,6 +515,7 @@ int icmp_reply_translation(struct sk_buff **pskb,
 
 	return 1;
 }
+EXPORT_SYMBOL_GPL(ip_nat_icmp_reply_translation);
 
 /* Protocol registration. */
 int ip_nat_protocol_register(struct ip_nat_protocol *proto)
@@ -525,6 +532,7 @@ int ip_nat_protocol_register(struct ip_nat_protocol *proto)
 	write_unlock_bh(&ip_nat_lock);
 	return ret;
 }
+EXPORT_SYMBOL(ip_nat_protocol_register);
 
 /* Noone stores the protocol anywhere; simply delete it. */
 void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
@@ -536,6 +544,7 @@ void ip_nat_protocol_unregister(struct ip_nat_protocol *proto)
 	/* Someone could be still looking at the proto in a bh. */
 	synchronize_net();
 }
+EXPORT_SYMBOL(ip_nat_protocol_unregister);
 
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
@@ -582,7 +591,7 @@ EXPORT_SYMBOL_GPL(ip_nat_port_nfattr_to_range);
 EXPORT_SYMBOL_GPL(ip_nat_port_range_to_nfattr);
 #endif
 
-int __init ip_nat_init(void)
+static int __init ip_nat_init(void)
 {
 	size_t i;
 
@@ -624,10 +633,14 @@ static int clean_nat(struct ip_conntrack *i, void *data)
 	return 0;
 }
 
-/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */
-void ip_nat_cleanup(void)
+static void __exit ip_nat_cleanup(void)
 {
 	ip_ct_iterate_cleanup(&clean_nat, NULL);
 	ip_conntrack_destroyed = NULL;
 	vfree(bysource);
 }
+
+MODULE_LICENSE("GPL");
+
+module_init(ip_nat_init);
+module_exit(ip_nat_cleanup);
diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c
index d2dd5d313556..5d506e0564d5 100644
--- a/net/ipv4/netfilter/ip_nat_helper.c
+++ b/net/ipv4/netfilter/ip_nat_helper.c
@@ -199,6 +199,7 @@ ip_nat_mangle_tcp_packet(struct sk_buff **pskb,
 	}
 	return 1;
 }
+EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
 			
 /* Generic function for mangling variable-length address changes inside
  * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
@@ -256,6 +257,7 @@ ip_nat_mangle_udp_packet(struct sk_buff **pskb,
 
 	return 1;
 }
+EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
 
 /* Adjust one found SACK option including checksum correction */
 static void
@@ -399,6 +401,7 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
 
 	return 1;
 }
+EXPORT_SYMBOL(ip_nat_seq_adjust);
 
 /* Setup NAT on this expected conntrack so it follows master. */
 /* If we fail to get a free NAT slot, we'll get dropped on confirm */
@@ -425,3 +428,4 @@ void ip_nat_follow_master(struct ip_conntrack *ct,
 	/* hook doesn't matter, but it has to do destination manip */
 	ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
 }
+EXPORT_SYMBOL(ip_nat_follow_master);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 0ff368b131f6..30cd4e18c129 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -108,8 +108,8 @@ ip_nat_fn(unsigned int hooknum,
 	case IP_CT_RELATED:
 	case IP_CT_RELATED+IP_CT_IS_REPLY:
 		if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
-			if (!icmp_reply_translation(pskb, ct, maniptype,
-						    CTINFO2DIR(ctinfo)))
+			if (!ip_nat_icmp_reply_translation(pskb, ct, maniptype,
+							   CTINFO2DIR(ctinfo)))
 				return NF_DROP;
 			else
 				return NF_ACCEPT;
@@ -152,7 +152,7 @@ ip_nat_fn(unsigned int hooknum,
 	}
 
 	IP_NF_ASSERT(info);
-	return nat_packet(ct, ctinfo, hooknum, pskb);
+	return ip_nat_packet(ct, ctinfo, hooknum, pskb);
 }
 
 static unsigned int
@@ -325,15 +325,10 @@ static int init_or_cleanup(int init)
 		printk("ip_nat_init: can't setup rules.\n");
 		goto cleanup_nothing;
 	}
-	ret = ip_nat_init();
-	if (ret < 0) {
-		printk("ip_nat_init: can't setup rules.\n");
-		goto cleanup_rule_init;
-	}
 	ret = nf_register_hook(&ip_nat_in_ops);
 	if (ret < 0) {
 		printk("ip_nat_init: can't register in hook.\n");
-		goto cleanup_nat;
+		goto cleanup_rule_init;
 	}
 	ret = nf_register_hook(&ip_nat_out_ops);
 	if (ret < 0) {
@@ -374,8 +369,6 @@ static int init_or_cleanup(int init)
 	nf_unregister_hook(&ip_nat_out_ops);
  cleanup_inops:
 	nf_unregister_hook(&ip_nat_in_ops);
- cleanup_nat:
-	ip_nat_cleanup();
  cleanup_rule_init:
 	ip_nat_rule_cleanup();
  cleanup_nothing:
@@ -395,14 +388,4 @@ static void __exit fini(void)
 module_init(init);
 module_exit(fini);
 
-EXPORT_SYMBOL(ip_nat_setup_info);
-EXPORT_SYMBOL(ip_nat_protocol_register);
-EXPORT_SYMBOL(ip_nat_protocol_unregister);
-EXPORT_SYMBOL_GPL(ip_nat_proto_find_get);
-EXPORT_SYMBOL_GPL(ip_nat_proto_put);
-EXPORT_SYMBOL(ip_nat_cheat_check);
-EXPORT_SYMBOL(ip_nat_mangle_tcp_packet);
-EXPORT_SYMBOL(ip_nat_mangle_udp_packet);
-EXPORT_SYMBOL(ip_nat_used_tuple);
-EXPORT_SYMBOL(ip_nat_follow_master);
 MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 6b251858d377196b8cea20e65cae60f584a42735 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Wed, 28 Sep 2005 16:31:48 -0700
Subject: [TCP]: Fix init_cwnd calculations in tcp_select_initial_window()

Match it up to what RFC2414 really specifies.
Noticed by Rick Jones.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d6e3d269e906..caf2e2cff293 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -190,15 +190,16 @@ void tcp_select_initial_window(int __space, __u32 mss,
 	}
 
 	/* Set initial window to value enough for senders,
-	 * following RFC1414. Senders, not following this RFC,
+	 * following RFC2414. Senders, not following this RFC,
 	 * will be satisfied with 2.
 	 */
 	if (mss > (1<<*rcv_wscale)) {
-		int init_cwnd = 4;
-		if (mss > 1460*3)
+		int init_cwnd;
+
+		if (mss > 1460)
 			init_cwnd = 2;
-		else if (mss > 1460)
-			init_cwnd = 3;
+		else
+			init_cwnd = (mss > 1095) ? 3 : 4;
 		if (*rcv_wnd > init_cwnd*mss)
 			*rcv_wnd = init_cwnd*mss;
 	}
-- 
cgit v1.2.3


From 01ff367e62f0474e4d39aa5812cbe2a30d96e1e9 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 29 Sep 2005 17:07:20 -0700
Subject: [TCP]: Revert 6b251858d377196b8cea20e65cae60f584a42735

But retain the comment fix.

Alexey Kuznetsov has explained the situation as follows:

--------------------

I think the fix is incorrect. Look, the RFC function init_cwnd(mss) is
not continuous: f.e. for mss=1095 it needs initial window 1095*4, but
for mss=1096 it is 1096*3. We do not know exactly what mss sender used
for calculations. If we advertised 1096 (and calculate initial window
3*1096), the sender could limit it to some value < 1096 and then it
will need window his_mss*4 > 3*1096 to send initial burst.

See?

So, the honest function for initial rcv_wnd derived from
tcp_init_cwnd() is:

	init_rcv_wnd(mss)=
	  min { init_cwnd(mss1)*mss1 for mss1 <= mss }

It is something sort of:

	if (mss < 1096)
		return mss*4;
	if (mss < 1096*2)
		return 1096*4;
	return mss*2;

(I just scribbled a graph on a piece of paper, it is difficult to see or
to explain without this)

I selected it differently giving more window than it is strictly
required.  Initial receive window must be large enough to allow sender
following to the rfc (or just setting initial cwnd to 2) to send
initial burst.  But besides that it is arbitrary, so I decided to give
slack space of one segment.

Actually, the logic was:

If mss is low/normal (<=ethernet), set window to receive more than
initial burst allowed by rfc under the worst conditions
i.e. mss*4. This gives slack space of 1 segment for ethernet frames.

For msses slightly more than ethernet frame, take 3. Try to give slack
space of 1 frame again.

If mss is huge, force 2*mss. No slack space.

Value 1460*3 is really confusing. Minimal one is 1096*2, but besides
that it is an arbitrary value. It was meant to be ~4096. 1460*3 is
just the magic number from RFC, 1460*3 = 1095*4 is the magic :-), so
that I guess hands typed this themselves.

--------------------

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index caf2e2cff293..c5b911f9b662 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -194,12 +194,11 @@ void tcp_select_initial_window(int __space, __u32 mss,
 	 * will be satisfied with 2.
 	 */
 	if (mss > (1<<*rcv_wscale)) {
-		int init_cwnd;
-
-		if (mss > 1460)
+		int init_cwnd = 4;
+		if (mss > 1460*3)
 			init_cwnd = 2;
-		else
-			init_cwnd = (mss > 1095) ? 3 : 4;
+		else if (mss > 1460)
+			init_cwnd = 3;
 		if (*rcv_wnd > init_cwnd*mss)
 			*rcv_wnd = init_cwnd*mss;
 	}
-- 
cgit v1.2.3


From 09e9ec87111ba818d8171262b15ba4c357eb1d27 Mon Sep 17 00:00:00 2001
From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
Date: Thu, 29 Sep 2005 17:17:15 -0700
Subject: [TCP]: Don't over-clamp window in tcp_clamp_window()

From: Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>

Handle better the case where the sender sends full sized
frames initially, then moves to a mode where it trickles
out small amounts of data at a time.

This known problem is even mentioned in the comments
above tcp_grow_window() in tcp_input.c, specifically:

...
 * The scheme does not work when sender sends good segments opening
 * window and then starts to feed us spagetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
...

When the sender gives full sized frames, the "struct sk_buff" overhead
from each packet is small.  So we'll advertize a larger window.
If the sender moves to a mode where small segments are sent, this
ratio becomes tilted to the other extreme and we start overrunning
the socket buffer space.

tcp_clamp_window() tries to address this, but its clamping of
tp->window_clamp is a wee bit too aggressive for this particular case.

Fix confirmed by Ion Badulescu.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a7537c7bbd06..677419d0c9ad 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -355,8 +355,6 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp)
 			app_win -= icsk->icsk_ack.rcv_mss;
 		app_win = max(app_win, 2U*tp->advmss);
 
-		if (!ofo_win)
-			tp->window_clamp = min(tp->window_clamp, app_win);
 		tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss);
 	}
 }
-- 
cgit v1.2.3


From 325ed8239309cb29f10ea58c5a668058ead11479 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 3 Oct 2005 13:57:23 -0700
Subject: [NET]: Fix packet timestamping.

I've found the problem in general.  It affects any 64-bit
architecture.  The problem occurs when you change the system time.

Suppose that when you boot your system clock is forward by a day.
This gets recorded down in skb_tv_base.  You then wind the clock back
by a day.  From that point onwards the offset will be negative which
essentially overflows the 32-bit variables they're stored in.

In fact, why don't we just store the real time stamp in those 32-bit
variables? After all, we're not going to overflow for quite a while
yet.

When we do overflow, we'll need a better solution of course.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_queue.c | 4 ++--
 net/ipv4/netfilter/ipt_ULOG.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index d54f14d926f6..36339eb39e17 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -240,8 +240,8 @@ ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp)
 
 	pmsg->packet_id       = (unsigned long )entry;
 	pmsg->data_len        = data_len;
-	pmsg->timestamp_sec   = skb_tv_base.tv_sec + entry->skb->tstamp.off_sec;
-	pmsg->timestamp_usec  = skb_tv_base.tv_usec + entry->skb->tstamp.off_usec;
+	pmsg->timestamp_sec   = entry->skb->tstamp.off_sec;
+	pmsg->timestamp_usec  = entry->skb->tstamp.off_usec;
 	pmsg->mark            = entry->skb->nfmark;
 	pmsg->hook            = entry->info->hook;
 	pmsg->hw_protocol     = entry->skb->protocol;
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index e2c14f3cb2fc..2883ccd8a91d 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -225,8 +225,8 @@ static void ipt_ulog_packet(unsigned int hooknum,
 
 	/* copy hook, prefix, timestamp, payload, etc. */
 	pm->data_len = copy_len;
-	pm->timestamp_sec = skb_tv_base.tv_sec + skb->tstamp.off_sec;
-	pm->timestamp_usec = skb_tv_base.tv_usec + skb->tstamp.off_usec;
+	pm->timestamp_sec = skb->tstamp.off_sec;
+	pm->timestamp_usec = skb->tstamp.off_usec;
 	pm->mark = skb->nfmark;
 	pm->hook = hooknum;
 	if (prefix != NULL)
-- 
cgit v1.2.3


From 81c3d5470ecc70564eb9209946730fe2be93ad06 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 3 Oct 2005 14:13:38 -0700
Subject: [INET]: speedup inet (tcp/dccp) lookups

Arnaldo and I agreed it could be applied now, because I have other
pending patches depending on this one (Thank you Arnaldo)

(The other important patch moves skc_refcnt in a separate cache line,
so that the SMP/NUMA performance doesn't suffer from cache line ping pongs)

1) First some performance data :
--------------------------------

tcp_v4_rcv() wastes a *lot* of time in __inet_lookup_established()

The most time critical code is :

sk_for_each(sk, node, &head->chain) {
     if (INET_MATCH(sk, acookie, saddr, daddr, ports, dif))
         goto hit; /* You sunk my battleship! */
}

The sk_for_each() does use prefetch() hints but only the beginning of
"struct sock" is prefetched.

As INET_MATCH's first comparison uses inet_sk(__sk)->daddr, which is far
away from the beginning of "struct sock", it has to bring a cold cache
line into the CPU cache. Each iteration has to use at least 2 cache
lines.

This can be problematic if some chains are very long.

2) The goal
-----------

The idea I had is to change things so that INET_MATCH() may return
FALSE in 99% of cases only using the data already in the CPU cache,
using one cache line per iteration.

3) Description of the patch
---------------------------

Adds a new 'unsigned int skc_hash' field in 'struct sock_common',
filling a 32 bits hole on 64 bits platform.

struct sock_common {
	unsigned short		skc_family;
	volatile unsigned char	skc_state;
	unsigned char		skc_reuse;
	int			skc_bound_dev_if;
	struct hlist_node	skc_node;
	struct hlist_node	skc_bind_node;
	atomic_t		skc_refcnt;
+	unsigned int		skc_hash;
	struct proto		*skc_prot;
};

Store in this 32 bits field the full hash, not masked by (ehash_size -
1). Using this full hash as the first comparison done in INET_MATCH
permits us to immediately skip the element without touching a second cache
line in case of a miss.

Suppress the sk_hashent/tw_hashent fields since skc_hash (aliased to
sk_hash and tw_hash) already contains the slot number if we mask with
(ehash_size - 1)

File include/net/inet_hashtables.h

64 bits platforms :
#define INET_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
     (((__sk)->sk_hash == (__hash))                         &&  \
     ((*((__u64 *)&(inet_sk(__sk)->daddr)))== (__cookie))   &&  \
     ((*((__u32 *)&(inet_sk(__sk)->dport))) == (__ports))   &&  \
     (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))

32bits platforms:
#define TCP_IPV4_MATCH(__sk, __hash, __cookie, __saddr, __daddr, __ports, __dif)\
     (((__sk)->sk_hash == (__hash))                 &&  \
     (inet_sk(__sk)->daddr          == (__saddr))   &&  \
     (inet_sk(__sk)->rcv_saddr      == (__daddr))   &&  \
     (!((__sk)->sk_bound_dev_if) || ((__sk)->sk_bound_dev_if == (__dif))))


- Adds a prefetch(head->chain.first) in
__inet_lookup_established()/__tcp_v4_check_established() and
__inet6_lookup_established()/__tcp_v6_check_established() and
__dccp_v4_check_established() to bring into cache the first element of the
list, before the {read|write}_lock(&head->lock);

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Acked-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_timewait_sock.c |  6 +++---
 net/ipv4/tcp_ipv4.c           | 11 ++++++-----
 2 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 4d1502a49852..f9076ef3a1a8 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -20,7 +20,7 @@ void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashi
 	struct inet_bind_hashbucket *bhead;
 	struct inet_bind_bucket *tb;
 	/* Unlink from established hashes. */
-	struct inet_ehash_bucket *ehead = &hashinfo->ehash[tw->tw_hashent];
+	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, tw->tw_hash);
 
 	write_lock(&ehead->lock);
 	if (hlist_unhashed(&tw->tw_node)) {
@@ -60,7 +60,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
 {
 	const struct inet_sock *inet = inet_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
-	struct inet_ehash_bucket *ehead = &hashinfo->ehash[sk->sk_hashent];
+	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
 	struct inet_bind_hashbucket *bhead;
 	/* Step 1: Put TW into bind hash. Original socket stays there too.
 	   Note, that any socket with inet->num != 0 MUST be bound in
@@ -106,7 +106,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_dport	    = inet->dport;
 		tw->tw_family	    = sk->sk_family;
 		tw->tw_reuse	    = sk->sk_reuse;
-		tw->tw_hashent	    = sk->sk_hashent;
+		tw->tw_hash	    = sk->sk_hash;
 		tw->tw_ipv6only	    = 0;
 		tw->tw_prot	    = sk->sk_prot_creator;
 		atomic_set(&tw->tw_refcnt, 1);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 13dfb391cdf1..c85819d8474b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -130,19 +130,20 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 	int dif = sk->sk_bound_dev_if;
 	INET_ADDR_COOKIE(acookie, saddr, daddr)
 	const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport);
-	const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
-	struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
+	unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport);
+	struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash);
 	struct sock *sk2;
 	const struct hlist_node *node;
 	struct inet_timewait_sock *tw;
 
+	prefetch(head->chain.first);
 	write_lock(&head->lock);
 
 	/* Check TIME-WAIT sockets first. */
 	sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
 		tw = inet_twsk(sk2);
 
-		if (INET_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
+		if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) {
 			const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2);
 			struct tcp_sock *tp = tcp_sk(sk);
 
@@ -179,7 +180,7 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
 
 	/* And established part... */
 	sk_for_each(sk2, node, &head->chain) {
-		if (INET_MATCH(sk2, acookie, saddr, daddr, ports, dif))
+		if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif))
 			goto not_unique;
 	}
 
@@ -188,7 +189,7 @@ unique:
 	 * in hash table socket with a funny identity. */
 	inet->num = lport;
 	inet->sport = htons(lport);
-	sk->sk_hashent = hash;
+	sk->sk_hash = hash;
 	BUG_TRAP(sk_unhashed(sk));
 	__sk_add_node(sk, &head->chain);
 	sock_prot_inc_use(sk->sk_prot);
-- 
cgit v1.2.3


From 444fc8fc3a1f926fa224655b8950bd853368c1a3 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 3 Oct 2005 14:18:10 -0700
Subject: [IPV4]: Fix "Proxy ARP seems broken"

Meelis Roos <mroos@linux.ee> wrote:
> RK> My firewall setup relies on proxyarp working.  However, with 2.6.14-rc3,
> RK> it appears to be completely broken.  The firewall is 212.18.232.186,
>
> Same here with some kernel between 14-rc2 and 14-rc3 - no response to
> ARP on a proxyarp gateway. Sorry, no exact revision and no more debugging
> yet since it's a production gateway.

The breakage is caused by the change to use the CB area for flagging
whether a packet has been queued due to proxy_delay.  This area gets
cleared every time arp_rcv gets called.  Unfortunately packets delayed
due to proxy_delay also go through arp_rcv when they are reprocessed.

In fact, I can't think of a reason why delayed proxy packets should go
through netfilter again at all.  So the easiest solution is to bypass
that and go straight to arp_process.

This is essentially what would've happened before netfilter support
was added to ARP.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 8bf312bdea13..ec0e36893b01 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -697,12 +697,6 @@ void arp_send(int type, int ptype, u32 dest_ip,
 	arp_xmit(skb);
 }
 
-static void parp_redo(struct sk_buff *skb)
-{
-	nf_reset(skb);
-	arp_rcv(skb, skb->dev, NULL, skb->dev);
-}
-
 /*
  *	Process an arp request.
  */
@@ -922,6 +916,11 @@ out:
 	return 0;
 }
 
+static void parp_redo(struct sk_buff *skb)
+{
+	arp_process(skb);
+}
+
 
 /*
  *	Receive an arp request from the device layer.
-- 
cgit v1.2.3


From e5ed639913eea3e4783a550291775ab78dd84966 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 3 Oct 2005 14:35:55 -0700
Subject: [IPV4]: Replace __in_dev_get with __in_dev_get_rcu/rtnl

The following patch renames __in_dev_get() to __in_dev_get_rtnl() and
introduces __in_dev_get_rcu() to cover the second case.

1) RCU with refcnt should use in_dev_get().
2) RCU without refcnt should use __in_dev_get_rcu().
3) All others must hold RTNL and use __in_dev_get_rtnl().

There is one exception in net/ipv4/route.c which is in fact a pre-existing
race condition.  I've marked it as such so that we remember to fix it.

This patch is based on suggestions and prior work by Suzanne Wood and
Paul McKenney.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c                               | 10 +++++-----
 net/ipv4/devinet.c                           | 22 +++++++++++-----------
 net/ipv4/fib_frontend.c                      |  4 ++--
 net/ipv4/fib_semantics.c                     |  4 ++--
 net/ipv4/igmp.c                              |  2 +-
 net/ipv4/ip_gre.c                            |  4 ++--
 net/ipv4/ipmr.c                              |  6 +++---
 net/ipv4/netfilter/ip_conntrack_netbios_ns.c |  2 +-
 net/ipv4/netfilter/ipt_REDIRECT.c            |  2 +-
 net/ipv4/route.c                             |  6 ++++--
 10 files changed, 32 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index ec0e36893b01..b425748f02d7 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -241,7 +241,7 @@ static int arp_constructor(struct neighbour *neigh)
 	neigh->type = inet_addr_type(addr);
 
 	rcu_read_lock();
-	in_dev = rcu_dereference(__in_dev_get(dev));
+	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev == NULL) {
 		rcu_read_unlock();
 		return -EINVAL;
@@ -989,8 +989,8 @@ static int arp_req_set(struct arpreq *r, struct net_device * dev)
 			ipv4_devconf.proxy_arp = 1;
 			return 0;
 		}
-		if (__in_dev_get(dev)) {
-			__in_dev_get(dev)->cnf.proxy_arp = 1;
+		if (__in_dev_get_rtnl(dev)) {
+			__in_dev_get_rtnl(dev)->cnf.proxy_arp = 1;
 			return 0;
 		}
 		return -ENXIO;
@@ -1095,8 +1095,8 @@ static int arp_req_delete(struct arpreq *r, struct net_device * dev)
 				ipv4_devconf.proxy_arp = 0;
 				return 0;
 			}
-			if (__in_dev_get(dev)) {
-				__in_dev_get(dev)->cnf.proxy_arp = 0;
+			if (__in_dev_get_rtnl(dev)) {
+				__in_dev_get_rtnl(dev)->cnf.proxy_arp = 0;
 				return 0;
 			}
 			return -ENXIO;
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index ba2895ae8151..74f2207e131a 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -351,7 +351,7 @@ static int inet_insert_ifa(struct in_ifaddr *ifa)
 
 static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
 {
-	struct in_device *in_dev = __in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
 	ASSERT_RTNL();
 
@@ -449,7 +449,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 		goto out;
 
 	rc = -ENOBUFS;
-	if ((in_dev = __in_dev_get(dev)) == NULL) {
+	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
 		in_dev = inetdev_init(dev);
 		if (!in_dev)
 			goto out;
@@ -584,7 +584,7 @@ int devinet_ioctl(unsigned int cmd, void __user *arg)
 	if (colon)
 		*colon = ':';
 
-	if ((in_dev = __in_dev_get(dev)) != NULL) {
+	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
 		if (tryaddrmatch) {
 			/* Matthias Andree */
 			/* compare label and address (4.4BSD style) */
@@ -748,7 +748,7 @@ rarok:
 
 static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
 {
-	struct in_device *in_dev = __in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 	struct in_ifaddr *ifa;
 	struct ifreq ifr;
 	int done = 0;
@@ -791,7 +791,7 @@ u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope)
 	struct in_device *in_dev;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get(dev);
+	in_dev = __in_dev_get_rcu(dev);
 	if (!in_dev)
 		goto no_in_dev;
 
@@ -818,7 +818,7 @@ no_in_dev:
 	read_lock(&dev_base_lock);
 	rcu_read_lock();
 	for (dev = dev_base; dev; dev = dev->next) {
-		if ((in_dev = __in_dev_get(dev)) == NULL)
+		if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
 			continue;
 
 		for_primary_ifa(in_dev) {
@@ -887,7 +887,7 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop
 
 	if (dev) {
 		rcu_read_lock();
-		if ((in_dev = __in_dev_get(dev)))
+		if ((in_dev = __in_dev_get_rcu(dev)))
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
 		rcu_read_unlock();
 
@@ -897,7 +897,7 @@ u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scop
 	read_lock(&dev_base_lock);
 	rcu_read_lock();
 	for (dev = dev_base; dev; dev = dev->next) {
-		if ((in_dev = __in_dev_get(dev))) {
+		if ((in_dev = __in_dev_get_rcu(dev))) {
 			addr = confirm_addr_indev(in_dev, dst, local, scope);
 			if (addr)
 				break;
@@ -957,7 +957,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			 void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev = __in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
 	ASSERT_RTNL();
 
@@ -1078,7 +1078,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
 		if (idx > s_idx)
 			s_ip_idx = 0;
 		rcu_read_lock();
-		if ((in_dev = __in_dev_get(dev)) == NULL) {
+		if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
 			rcu_read_unlock();
 			continue;
 		}
@@ -1149,7 +1149,7 @@ void inet_forward_change(void)
 	for (dev = dev_base; dev; dev = dev->next) {
 		struct in_device *in_dev;
 		rcu_read_lock();
-		in_dev = __in_dev_get(dev);
+		in_dev = __in_dev_get_rcu(dev);
 		if (in_dev)
 			in_dev->cnf.forwarding = on;
 		rcu_read_unlock();
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4e1379f71269..e61bc7177eb1 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -173,7 +173,7 @@ int fib_validate_source(u32 src, u32 dst, u8 tos, int oif,
 
 	no_addr = rpf = 0;
 	rcu_read_lock();
-	in_dev = __in_dev_get(dev);
+	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		no_addr = in_dev->ifa_list == NULL;
 		rpf = IN_DEV_RPFILTER(in_dev);
@@ -607,7 +607,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct in_device *in_dev = __in_dev_get(dev);
+	struct in_device *in_dev = __in_dev_get_rtnl(dev);
 
 	if (event == NETDEV_UNREGISTER) {
 		fib_disable_ip(dev, 2);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index d41219e8037c..186f20c4a45e 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1087,7 +1087,7 @@ fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm,
 		rta->rta_oif = &dev->ifindex;
 		if (colon) {
 			struct in_ifaddr *ifa;
-			struct in_device *in_dev = __in_dev_get(dev);
+			struct in_device *in_dev = __in_dev_get_rtnl(dev);
 			if (!in_dev)
 				return -ENODEV;
 			*colon = ':';
@@ -1268,7 +1268,7 @@ int fib_sync_up(struct net_device *dev)
 			}
 			if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
 				continue;
-			if (nh->nh_dev != dev || __in_dev_get(dev) == NULL)
+			if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
 				continue;
 			alive++;
 			spin_lock_bh(&fib_multipath_lock);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 70c44e4c3ceb..8b6d3939e1e6 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1323,7 +1323,7 @@ static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr)
 	}
 	if (dev) {
 		imr->imr_ifindex = dev->ifindex;
-		idev = __in_dev_get(dev);
+		idev = __in_dev_get_rtnl(dev);
 	}
 	return idev;
 }
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f0d5740d7e22..896ce3f8f53a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1104,10 +1104,10 @@ static int ipgre_open(struct net_device *dev)
 			return -EADDRNOTAVAIL;
 		dev = rt->u.dst.dev;
 		ip_rt_put(rt);
-		if (__in_dev_get(dev) == NULL)
+		if (__in_dev_get_rtnl(dev) == NULL)
 			return -EADDRNOTAVAIL;
 		t->mlink = dev->ifindex;
-		ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr);
+		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
 	}
 	return 0;
 }
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 9dbf5909f3a6..302b7eb507c9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -149,7 +149,7 @@ struct net_device *ipmr_new_tunnel(struct vifctl *v)
 		if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
 			dev->flags |= IFF_MULTICAST;
 
-			in_dev = __in_dev_get(dev);
+			in_dev = __in_dev_get_rtnl(dev);
 			if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
 				goto failure;
 			in_dev->cnf.rp_filter = 0;
@@ -278,7 +278,7 @@ static int vif_delete(int vifi)
 
 	dev_set_allmulti(dev, -1);
 
-	if ((in_dev = __in_dev_get(dev)) != NULL) {
+	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
 		in_dev->cnf.mc_forwarding--;
 		ip_rt_multicast_event(in_dev);
 	}
@@ -421,7 +421,7 @@ static int vif_add(struct vifctl *vifc, int mrtsock)
 		return -EINVAL;
 	}
 
-	if ((in_dev = __in_dev_get(dev)) == NULL)
+	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
 		return -EADDRNOTAVAIL;
 	in_dev->cnf.mc_forwarding++;
 	dev_set_allmulti(dev, +1);
diff --git a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
index 577bac22dcc6..186646eb249f 100644
--- a/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
+++ b/net/ipv4/netfilter/ip_conntrack_netbios_ns.c
@@ -58,7 +58,7 @@ static int help(struct sk_buff **pskb,
 		goto out;
 
 	rcu_read_lock();
-	in_dev = __in_dev_get(rt->u.dst.dev);
+	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
 	if (in_dev != NULL) {
 		for_primary_ifa(in_dev) {
 			if (ifa->ifa_broadcast == iph->daddr) {
diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c
index 715cb613405c..5245bfd33d52 100644
--- a/net/ipv4/netfilter/ipt_REDIRECT.c
+++ b/net/ipv4/netfilter/ipt_REDIRECT.c
@@ -93,7 +93,7 @@ redirect_target(struct sk_buff **pskb,
 		newdst = 0;
 		
 		rcu_read_lock();
-		indev = __in_dev_get((*pskb)->dev);
+		indev = __in_dev_get_rcu((*pskb)->dev);
 		if (indev && (ifa = indev->ifa_list))
 			newdst = ifa->ifa_local;
 		rcu_read_unlock();
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8549f26e2495..381dd6a6aebb 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2128,7 +2128,7 @@ int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr,
 		struct in_device *in_dev;
 
 		rcu_read_lock();
-		if ((in_dev = __in_dev_get(dev)) != NULL) {
+		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
 			int our = ip_check_mc(in_dev, daddr, saddr,
 				skb->nh.iph->protocol);
 			if (our
@@ -2443,7 +2443,9 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 		err = -ENODEV;
 		if (dev_out == NULL)
 			goto out;
-		if (__in_dev_get(dev_out) == NULL) {
+
+		/* RACE: Check return value of inet_select_addr instead. */
+		if (__in_dev_get_rtnl(dev_out) == NULL) {
 			dev_put(dev_out);
 			goto out;	/* Wrong error code */
 		}
-- 
cgit v1.2.3


From 7ce312467edc270fcbd8a699efabb37ce1802b98 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Mon, 3 Oct 2005 16:07:30 -0700
Subject: [IPV4]: Update icmp sysctl docs and disable broadcast ECHO/TIMESTAMP
 by default

It's not a good idea to be smurf'able by default.
The few people who need this can turn it on.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 24eb56ae1b5a..90dca711ac9f 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -188,7 +188,7 @@ struct icmp_err icmp_err_convert[] = {
 
 /* Control parameters for ECHO replies. */
 int sysctl_icmp_echo_ignore_all;
-int sysctl_icmp_echo_ignore_broadcasts;
+int sysctl_icmp_echo_ignore_broadcasts = 1;
 
 /* Control parameter - ignore bogus broadcast responses? */
 int sysctl_icmp_ignore_bogus_error_responses;
-- 
cgit v1.2.3


From e6308be85afee685347fa3440bed10faaa5d6c1a Mon Sep 17 00:00:00 2001
From: Robert Olsson <robert.olsson@its.uu.se>
Date: Tue, 4 Oct 2005 13:01:58 -0700
Subject: [IPV4]: fib_trie root-node expansion

The patch below introduces special thresholds to keep root node in the trie
large. This gives a flatter tree at the cost of a modest memory increase.
Overall it seems to be a gain, and this was also proposed by one of the
authors of the paper in a recent seminar.

Main table after loading 123 k routes.

	Aver depth:     3.30
	Max depth:      9
        Root-node size  12 bits
        Total size: 4044  kB

With the patch:
	Aver depth:     2.78
	Max depth:      8
        Root-node size  15 bits
        Total size: 4150  kB

An increase of 8-10% was seen in forwarding performance for an rDoS attack.

Signed-off-by: Robert Olsson <robert.olsson@its.uu.se>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 50c0519cd70d..0093ea08c7f5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -286,6 +286,8 @@ static inline void check_tnode(const struct tnode *tn)
 
 static int halve_threshold = 25;
 static int inflate_threshold = 50;
+static int halve_threshold_root = 15;
+static int inflate_threshold_root = 25; 
 
 
 static void __alias_free_mem(struct rcu_head *head)
@@ -449,6 +451,8 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	int i;
 	int err = 0;
 	struct tnode *old_tn;
+	int inflate_threshold_use;
+	int halve_threshold_use;
 
  	if (!tn)
 		return NULL;
@@ -541,10 +545,17 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 	check_tnode(tn);
 
+	/* Keep root node larger  */
+
+	if(!tn->parent)
+		inflate_threshold_use = inflate_threshold_root;
+	else 
+		inflate_threshold_use = inflate_threshold;
+
 	err = 0;
 	while ((tn->full_children > 0 &&
 	       50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
-				inflate_threshold * tnode_child_length(tn))) {
+				inflate_threshold_use * tnode_child_length(tn))) {
 
 		old_tn = tn;
 		tn = inflate(t, tn);
@@ -564,10 +575,18 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	 * node is above threshold.
 	 */
 
+
+	/* Keep root node larger  */
+
+	if(!tn->parent)
+		halve_threshold_use = halve_threshold_root;
+	else 
+		halve_threshold_use = halve_threshold;
+
 	err = 0;
 	while (tn->bits > 1 &&
 	       100 * (tnode_child_length(tn) - tn->empty_children) <
-	       halve_threshold * tnode_child_length(tn)) {
+	       halve_threshold_use * tnode_child_length(tn)) {
 
 		old_tn = tn;
 		tn = halve(t, tn);
-- 
cgit v1.2.3


From a5181ab06ddca8071b4eb54ac2c314f7d24825d4 Mon Sep 17 00:00:00 2001
From: "Horst H. von Brand" <vonbrand@inf.utfsm.cl>
Date: Tue, 4 Oct 2005 15:58:56 -0700
Subject: [NETFILTER]: Fix Kconfig typo

Signed-off-by: Horst H. von Brand <vonbrand@inf.utfsm.cl>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 2cd7e7d1ac90..a7659728e7a0 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -141,7 +141,7 @@ config IP_NF_PPTP
 	tristate  'PPTP protocol support'
 	help
 	  This module adds support for PPTP (Point to Point Tunnelling
-	  Protocol, RFC2637) conncection tracking and NAT. 
+	  Protocol, RFC2637) connection tracking and NAT. 
 	
 	  If you are running PPTP sessions over a stateful firewall or NAT
 	  box, you may want to enable this feature.  
-- 
cgit v1.2.3


From 8eea00a44d9f493869f8d30b72e3ed18475be556 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Tue, 4 Oct 2005 22:42:15 -0700
Subject: [IPVS]: fix sparse gfp nocast warnings

From: Randy Dunlap <rdunlap@xenotime.net>

Fix implicit nocast warnings in ip_vs code:
net/ipv4/ipvs/ip_vs_app.c:631:54: warning: implicit cast to nocast type

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipvs/ip_vs_app.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index 6e092dadb388..b942ff3c8860 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -604,7 +604,7 @@ static struct file_operations ip_vs_app_fops = {
 /*
  *	Replace a segment of data with a new segment
  */
-int ip_vs_skb_replace(struct sk_buff *skb, int pri,
+int ip_vs_skb_replace(struct sk_buff *skb, unsigned int __nocast pri,
 		      char *o_buf, int o_len, char *n_buf, int n_len)
 {
 	struct iphdr *iph;
-- 
cgit v1.2.3


From 42a39450f830c57432fd4e5644fa81f41ce7156d Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@osdl.org>
Date: Wed, 5 Oct 2005 12:09:31 -0700
Subject: [TCP]: BIC coding bug in Linux 2.6.13

Missing parentheses cause BIC to be slow in increasing the congestion
window.

Spotted by Injong Rhee.

Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_bic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index b940346de4e7..6d80e063c187 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -136,7 +136,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd)
 		else if (cwnd < ca->last_max_cwnd + max_increment*(BICTCP_B-1))
 			/* slow start */
 			ca->cnt = (cwnd * (BICTCP_B-1))
-				/ cwnd-ca->last_max_cwnd;
+				/ (cwnd - ca->last_max_cwnd);
 		else
 			/* linear increase */
 			ca->cnt = cwnd / max_increment;
-- 
cgit v1.2.3


From dd0fc66fb33cd610bc1a5db8a5e232d34879b4d7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Fri, 7 Oct 2005 07:46:04 +0100
Subject: [PATCH] gfp flags annotations - part 1

 - added typedef unsigned int __nocast gfp_t;

 - replaced __nocast uses for gfp flags with gfp_t - it gives exactly
   the same warnings as far as sparse is concerned, doesn't change
   generated code (from gcc point of view we replaced unsigned int with
   typedef) and documents what's going on far better.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 net/ipv4/inet_connection_sock.c | 2 +-
 net/ipv4/ipvs/ip_vs_app.c       | 2 +-
 net/ipv4/tcp_output.c           | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fe3c6d3d0c91..94468a76c5b4 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -494,7 +494,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
 
 struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
-			    const unsigned int __nocast priority)
+			    const gfp_t priority)
 {
 	struct sock *newsk = sk_clone(sk, priority);
 
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index b942ff3c8860..fc6f95aaa969 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -604,7 +604,7 @@ static struct file_operations ip_vs_app_fops = {
 /*
  *	Replace a segment of data with a new segment
  */
-int ip_vs_skb_replace(struct sk_buff *skb, unsigned int __nocast pri,
+int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
 		      char *o_buf, int o_len, char *n_buf, int n_len)
 {
 	struct iphdr *iph;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c5b911f9b662..8225e4257258 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1610,7 +1610,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
-- 
cgit v1.2.3


From 85d9b05d9b1edad9a2630584754720a957ab0a2a Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 10 Oct 2005 20:47:42 -0700
Subject: [NETFILTER] PPTP helper: Add missing Kconfig dependency

PPTP should not be selectable without conntrack enabled

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a7659728e7a0..4b6f80775fb0 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -139,6 +139,7 @@ config IP_NF_AMANDA
 
 config IP_NF_PPTP
 	tristate  'PPTP protocol support'
+	depends on IP_NF_CONNTRACK
 	help
 	  This module adds support for PPTP (Point to Point Tunnelling
 	  Protocol, RFC2637) connection tracking and NAT. 
-- 
cgit v1.2.3


From f40863cec87464f3f4ec3a6c00e3fda3bbb0c91b Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 10 Oct 2005 20:51:53 -0700
Subject: [NETFILTER] ipt_ULOG: Mark ipt_ULOG as OBSOLETE

Similar to nfnetlink_queue and ip_queue, we mark ipt_ULOG as obsolete.
This should have been part of the original nfnetlink_log merge, but
I somehow missed it.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/Kconfig | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 4b6f80775fb0..7d917e4ce1d9 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -499,9 +499,14 @@ config IP_NF_TARGET_LOG
 	  To compile it as a module, choose M here.  If unsure, say N.
 
 config IP_NF_TARGET_ULOG
-	tristate "ULOG target support"
+	tristate "ULOG target support (OBSOLETE)"
 	depends on IP_NF_IPTABLES
 	---help---
+
+	  This option enables the old IPv4-only "ipt_ULOG" implementation
+	  which has been obsoleted by the new "nfnetlink_log" code (see
+	  CONFIG_NETFILTER_NETLINK_LOG).
+
 	  This option adds a `ULOG' target, which allows you to create rules in
 	  any iptables table. The packet is passed to a userspace logging
 	  daemon using netlink multicast sockets; unlike the LOG target
-- 
cgit v1.2.3


From d000eaf7720cb12cd03cd3d55f71be44357d27a9 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 10 Oct 2005 20:52:51 -0700
Subject: [NETFILTER] conntrack_netlink: Fix endian issue with status from
 userspace

When we send "status" from userspace, we forget to convert the endianness.
This patch adds the required conversion.  Thanks to Pablo Neira for
discovering this.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_netlink.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index b08a432efcf8..eade2749915a 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -833,7 +833,8 @@ out:
 static inline int
 ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[])
 {
-	unsigned long d, status = *(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]);
+	unsigned long d;
+	unsigned status = ntohl(*(u_int32_t *)NFA_DATA(cda[CTA_STATUS-1]));
 	d = ct->status ^ status;
 
 	if (d & (IPS_EXPECTED|IPS_CONFIRMED|IPS_DYING))
-- 
cgit v1.2.3


From a1bcc3f26885b0a8bf04799551de2e9574ccbda1 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Oct 2005 20:53:16 -0700
Subject: [NETFILTER] ctnetlink: ICMP ID is not mandatory

The ID is only required by ICMP type 8 (echo), so it's not
mandatory for all sorts of ICMP connections. This patch makes
mandatory only the type and the code for ICMP netlink messages.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 838d1d69b36e..98f0015dd255 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -296,8 +296,7 @@ static int icmp_nfattr_to_tuple(struct nfattr *tb[],
 				struct ip_conntrack_tuple *tuple)
 {
 	if (!tb[CTA_PROTO_ICMP_TYPE-1]
-	    || !tb[CTA_PROTO_ICMP_CODE-1]
-	    || !tb[CTA_PROTO_ICMP_ID-1])
+	    || !tb[CTA_PROTO_ICMP_CODE-1])
 		return -1;
 
 	tuple->dst.u.icmp.type = 
-- 
cgit v1.2.3


From e1c73b78e3706bd3c336d4730a01dd4081dfb7ee Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Oct 2005 20:55:49 -0700
Subject: [NETFILTER] ctnetlink: add one nesting level for TCP state

To keep consistency, the TCP private protocol information is nested
attributes under CTA_PROTOINFO_TCP. This way the sequence of attributes to
access the TCP state information looks like here below:

CTA_PROTOINFO
CTA_PROTOINFO_TCP
CTA_PROTOINFO_TCP_STATE

instead of:

CTA_PROTOINFO
CTA_PROTOINFO_TCP_STATE

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 121760d6cc50..75e27e65c28f 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -341,11 +341,15 @@ static int tcp_print_conntrack(struct seq_file *s,
 static int tcp_to_nfattr(struct sk_buff *skb, struct nfattr *nfa,
 			 const struct ip_conntrack *ct)
 {
+	struct nfattr *nest_parms = NFA_NEST(skb, CTA_PROTOINFO_TCP);
+	
 	read_lock_bh(&tcp_lock);
 	NFA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t),
 		&ct->proto.tcp.state);
 	read_unlock_bh(&tcp_lock);
 
+	NFA_NEST_END(skb, nest_parms);
+
 	return 0;
 
 nfattr_failure:
-- 
cgit v1.2.3


From a02a64223eddb410712b015fb3342c9a316ab70b Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 10 Oct 2005 21:11:08 -0700
Subject: [IPSEC]: Use ALIGN macro in ESP

This patch uses the macro ALIGN in all the applicable spots for ESP.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/esp4.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 1b5a09d1b90b..e911c6dd8296 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -5,6 +5,7 @@
 #include <net/esp.h>
 #include <asm/scatterlist.h>
 #include <linux/crypto.h>
+#include <linux/kernel.h>
 #include <linux/pfkeyv2.h>
 #include <linux/random.h>
 #include <net/icmp.h>
@@ -42,10 +43,10 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	esp = x->data;
 	alen = esp->auth.icv_trunc_len;
 	tfm = esp->conf.tfm;
-	blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3;
-	clen = (clen + 2 + blksize-1)&~(blksize-1);
+	blksize = ALIGN(crypto_tfm_alg_blocksize(tfm), 4);
+	clen = ALIGN(clen + 2, blksize);
 	if (esp->conf.padlen)
-		clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+		clen = ALIGN(clen, esp->conf.padlen);
 
 	if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0)
 		goto error;
@@ -307,13 +308,13 @@ static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
 	u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
 
 	if (x->props.mode) {
-		mtu = (mtu + 2 + blksize-1)&~(blksize-1);
+		mtu = ALIGN(mtu + 2, blksize);
 	} else {
 		/* The worst case. */
 		mtu += 2 + blksize;
 	}
 	if (esp->conf.padlen)
-		mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1);
+		mtu = ALIGN(mtu, esp->conf.padlen);
 
 	return mtu + x->props.header_len + esp->auth.icv_trunc_len;
 }
-- 
cgit v1.2.3


From d4875b049b2e6401a6e1fae90b7f09e20a636fcf Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 10 Oct 2005 21:11:34 -0700
Subject: [IPSEC] Fix block size/MTU bugs in ESP

This patch fixes the following bugs in ESP:

* Fix transport mode MTU overestimate.  This means that the inner MTU
  is smaller than it needs be.  Worse yet, given an input MTU which
  is a multiple of 4 it will always produce an estimate which is not
  a multiple of 4.

  For example, given a standard ESP/3DES/MD5 transform and an MTU of
  1500, the resulting MTU for transport mode is 1462 when it should
  be 1464.

  The reason for this is because IP header lengths are always a multiple
  of 4 for IPv4 and 8 for IPv6.

* Ensure that the block size is at least 4.  This is required by RFC2406
  and corresponds to what the esp_output function does.  At the moment
  this only affects crypto_null as its block size is 1.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/esp4.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index e911c6dd8296..1b18ce66e7b7 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -144,7 +144,7 @@ static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struc
 	struct ip_esp_hdr *esph;
 	struct esp_data *esp = x->data;
 	struct sk_buff *trailer;
-	int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+	int blksize = ALIGN(crypto_tfm_alg_blocksize(esp->conf.tfm), 4);
 	int alen = esp->auth.icv_trunc_len;
 	int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen;
 	int nfrags;
@@ -305,13 +305,13 @@ static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap,
 static u32 esp4_get_max_size(struct xfrm_state *x, int mtu)
 {
 	struct esp_data *esp = x->data;
-	u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm);
+	u32 blksize = ALIGN(crypto_tfm_alg_blocksize(esp->conf.tfm), 4);
 
 	if (x->props.mode) {
 		mtu = ALIGN(mtu + 2, blksize);
 	} else {
 		/* The worst case. */
-		mtu += 2 + blksize;
+		mtu = ALIGN(mtu + 2, 4) + blksize - 4;
 	}
 	if (esp->conf.padlen)
 		mtu = ALIGN(mtu, esp->conf.padlen);
-- 
cgit v1.2.3


From a051a8f7306476af0a74370ad56e793cb6c43bf7 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@netfilter.org>
Date: Mon, 10 Oct 2005 21:21:10 -0700
Subject: [NETFILTER]: Use only 32bit counters for CONNTRACK_ACCT

Initially we used 64bit counters for conntrack-based accounting, since we
had no event mechanism to tell userspace that our counters are about to
overflow.  With nfnetlink_conntrack, we now have such an event mechanism and
thus can save 16 bytes per connection.

Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_core.c    | 13 ++++++++-----
 net/ipv4/netfilter/ip_conntrack_netlink.c |  8 ++++----
 2 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index ea65dd3e517a..07a80b56e8dc 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -1119,7 +1119,7 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
 			unsigned long extra_jiffies,
 			int do_acct)
 {
-	int do_event = 0;
+	int event = 0;
 
 	IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
 	IP_NF_ASSERT(skb);
@@ -1129,13 +1129,13 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
 	/* If not in hash table, timer will not be active yet */
 	if (!is_confirmed(ct)) {
 		ct->timeout.expires = extra_jiffies;
-		do_event = 1;
+		event = IPCT_REFRESH;
 	} else {
 		/* Need del_timer for race avoidance (may already be dying). */
 		if (del_timer(&ct->timeout)) {
 			ct->timeout.expires = jiffies + extra_jiffies;
 			add_timer(&ct->timeout);
-			do_event = 1;
+			event = IPCT_REFRESH;
 		}
 	}
 
@@ -1144,14 +1144,17 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct,
 		ct->counters[CTINFO2DIR(ctinfo)].packets++;
 		ct->counters[CTINFO2DIR(ctinfo)].bytes += 
 						ntohs(skb->nh.iph->tot_len);
+		if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
+		    || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
+			event |= IPCT_COUNTER_FILLING;
 	}
 #endif
 
 	write_unlock_bh(&ip_conntrack_lock);
 
 	/* must be unlocked when calling event cache */
-	if (do_event)
-		ip_conntrack_event_cache(IPCT_REFRESH, skb);
+	if (event)
+		ip_conntrack_event_cache(event, skb);
 }
 
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index eade2749915a..06ed91ee8ace 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -177,11 +177,11 @@ ctnetlink_dump_counters(struct sk_buff *skb, const struct ip_conntrack *ct,
 	struct nfattr *nest_count = NFA_NEST(skb, type);
 	u_int64_t tmp;
 
-	tmp = cpu_to_be64(ct->counters[dir].packets);
-	NFA_PUT(skb, CTA_COUNTERS_PACKETS, sizeof(u_int64_t), &tmp);
+	tmp = htonl(ct->counters[dir].packets);
+	NFA_PUT(skb, CTA_COUNTERS32_PACKETS, sizeof(u_int32_t), &tmp);
 
-	tmp = cpu_to_be64(ct->counters[dir].bytes);
-	NFA_PUT(skb, CTA_COUNTERS_BYTES, sizeof(u_int64_t), &tmp);
+	tmp = htonl(ct->counters[dir].bytes);
+	NFA_PUT(skb, CTA_COUNTERS32_BYTES, sizeof(u_int32_t), &tmp);
 
 	NFA_NEST_END(skb, nest_count);
 
-- 
cgit v1.2.3


From 339231537506846cb232a2f0cc4a2c662b2d5b07 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Oct 2005 21:23:28 -0700
Subject: [NETFILTER] ctnetlink: allow userspace to change TCP state

This patch adds the ability to change the state of a TCP connection. I know
that this must be used with care, but it's required to provide complete
conntrack creation via conntrack_netlink. So I'll document this aspect in
the upcoming docs.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 75e27e65c28f..d6701cafbcc2 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -356,6 +356,28 @@ nfattr_failure:
 	read_unlock_bh(&tcp_lock);
 	return -1;
 }
+
+static int nfattr_to_tcp(struct nfattr *cda[], struct ip_conntrack *ct)
+{
+	struct nfattr *attr = cda[CTA_PROTOINFO_TCP-1];
+	struct nfattr *tb[CTA_PROTOINFO_TCP_MAX];
+
+        if (nfattr_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr) < 0)
+                goto nfattr_failure;
+
+	if (!tb[CTA_PROTOINFO_TCP_STATE-1])
+		return -EINVAL;
+
+	write_lock_bh(&tcp_lock);
+	ct->proto.tcp.state = 
+		*(u_int8_t *)NFA_DATA(tb[CTA_PROTOINFO_TCP_STATE-1]);
+	write_unlock_bh(&tcp_lock);
+
+	return 0;
+
+nfattr_failure:
+	return -1;
+}
 #endif
 
 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
@@ -1127,6 +1149,7 @@ struct ip_conntrack_protocol ip_conntrack_protocol_tcp =
 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
 	.to_nfattr		= tcp_to_nfattr,
+	.from_nfattr		= nfattr_to_tcp,
 	.tuple_to_nfattr	= ip_ct_port_tuple_to_nfattr,
 	.nfattr_to_tuple	= ip_ct_port_nfattr_to_tuple,
 #endif
-- 
cgit v1.2.3


From 061cb4a0ec34a6e3069d5a1b3c547e55a71498c5 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Mon, 10 Oct 2005 21:23:46 -0700
Subject: [NETFILTER] ctnetlink: add support to change protocol info

This patch adds support for changing the state of the private protocol
information via conntrack_netlink.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ip_conntrack_netlink.c | 37 +++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c
index 06ed91ee8ace..166e6069f121 100644
--- a/net/ipv4/netfilter/ip_conntrack_netlink.c
+++ b/net/ipv4/netfilter/ip_conntrack_netlink.c
@@ -949,6 +949,31 @@ ctnetlink_change_timeout(struct ip_conntrack *ct, struct nfattr *cda[])
 	return 0;
 }
 
+static inline int
+ctnetlink_change_protoinfo(struct ip_conntrack *ct, struct nfattr *cda[])
+{
+	struct nfattr *tb[CTA_PROTOINFO_MAX], *attr = cda[CTA_PROTOINFO-1];
+	struct ip_conntrack_protocol *proto;
+	u_int16_t npt = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum;
+	int err = 0;
+
+	if (nfattr_parse_nested(tb, CTA_PROTOINFO_MAX, attr) < 0)
+		goto nfattr_failure;
+
+	proto = ip_conntrack_proto_find_get(npt);
+	if (!proto)
+		return -EINVAL;
+
+	if (proto->from_nfattr)
+		err = proto->from_nfattr(tb, ct);
+	ip_conntrack_proto_put(proto); 
+
+	return err;
+
+nfattr_failure:
+	return -ENOMEM;
+}
+
 static int
 ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
 {
@@ -974,6 +999,12 @@ ctnetlink_change_conntrack(struct ip_conntrack *ct, struct nfattr *cda[])
 			return err;
 	}
 
+	if (cda[CTA_PROTOINFO-1]) {
+		err = ctnetlink_change_protoinfo(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
 	DEBUGP("all done\n");
 	return 0;
 }
@@ -1003,6 +1034,12 @@ ctnetlink_create_conntrack(struct nfattr *cda[],
 	if (err < 0)
 		goto err;
 
+	if (cda[CTA_PROTOINFO-1]) {
+		err = ctnetlink_change_protoinfo(ct, cda);
+		if (err < 0)
+			return err;
+	}
+
 	ct->helper = ip_conntrack_helper_find_get(rtuple);
 
 	add_timer(&ct->timeout);
-- 
cgit v1.2.3


From eeb2b8560676e454ad37ee30b49bc7d897edc9be Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Mon, 10 Oct 2005 21:25:23 -0700
Subject: [TWSK]: Grab the module refcount for timewait sockets

This is required to avoid unloading a module that has active timewait
sockets, such as DCCP.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/inet_timewait_sock.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index f9076ef3a1a8..a010e9a68811 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -111,6 +111,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
 		tw->tw_prot	    = sk->sk_prot_creator;
 		atomic_set(&tw->tw_refcnt, 1);
 		inet_twsk_dead_node_init(tw);
+		__module_get(tw->tw_prot->owner);
 	}
 
 	return tw;
-- 
cgit v1.2.3


From 9ff5c59ce278c37bca22fbf98076d199bcaf9845 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 12 Oct 2005 15:59:39 -0700
Subject: [TCP]: Add code to help track down "BUG at
 net/ipv4/tcp_output.c:438!"

This is the second report of this bug.  Unfortunately the first
reporter hasn't been able to reproduce it since to provide more
debugging info.

So let's apply this patch for 2.6.14 to

1) Make this non-fatal.
2) Provide the info we need to track it down.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8225e4257258..f37a50e55b68 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -435,7 +435,14 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
 	int nsize, old_factor;
 	u16 flags;
 
-	BUG_ON(len >= skb->len);
+	if (unlikely(len >= skb->len)) {
+		printk(KERN_DEBUG "TCP: seg_size=%u, mss=%u, seq=%u, "
+		       "end_seq=%u, skb->len=%u.\n", len, mss_now,
+		       TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+		       skb->len);
+		WARN_ON(1);
+		return 0;
+	}
 
 	nsize = skb_headlen(skb) - len;
 	if (nsize < 0)
-- 
cgit v1.2.3


From c8923c6b852d3a97c1faad0566e38fca330375a7 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Thu, 13 Oct 2005 14:41:23 -0700
Subject: [NETFILTER]: Fix OOPSes on machines with discontiguous cpu numbering.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Original patch by Harald Welte, with feedback from Herbert Xu
and testing by Sébastien Bernard.

EBTABLES, ARP tables, and IP/IP6 tables all assume that cpus
are numbered linearly.  That is not necessarily true.

This patch fixes that up by calculating the largest possible
cpu number, and allocating enough per-cpu structure space given
that.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/arp_tables.c | 14 +++++++++-----
 net/ipv4/netfilter/ip_tables.c  | 17 +++++++++++------
 2 files changed, 20 insertions(+), 11 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index fa1634256680..a7969286e6e7 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -716,8 +716,10 @@ static int translate_table(const char *name,
 	}
 
 	/* And one copy for every other CPU */
-	for (i = 1; i < num_possible_cpus(); i++) {
-		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+	for_each_cpu(i) {
+		if (i == 0)
+			continue;
+		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
 		       newinfo->entries,
 		       SMP_ALIGN(newinfo->size));
 	}
@@ -767,7 +769,7 @@ static void get_counters(const struct arpt_table_info *t,
 	unsigned int cpu;
 	unsigned int i;
 
-	for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+	for_each_cpu(cpu) {
 		i = 0;
 		ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
 				   t->size,
@@ -885,7 +887,8 @@ static int do_replace(void __user *user, unsigned int len)
 		return -ENOMEM;
 
 	newinfo = vmalloc(sizeof(struct arpt_table_info)
-			  + SMP_ALIGN(tmp.size) * num_possible_cpus());
+			  + SMP_ALIGN(tmp.size) *
+			  		(highest_possible_processor_id()+1));
 	if (!newinfo)
 		return -ENOMEM;
 
@@ -1158,7 +1161,8 @@ int arpt_register_table(struct arpt_table *table,
 		= { 0, 0, 0, { 0 }, { 0 }, { } };
 
 	newinfo = vmalloc(sizeof(struct arpt_table_info)
-			  + SMP_ALIGN(repl->size) * num_possible_cpus());
+			  + SMP_ALIGN(repl->size) *
+			  		(highest_possible_processor_id()+1));
 	if (!newinfo) {
 		ret = -ENOMEM;
 		return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index eef99a1b5de6..75c27e92f6ab 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -27,6 +27,7 @@
 #include <asm/semaphore.h>
 #include <linux/proc_fs.h>
 #include <linux/err.h>
+#include <linux/cpumask.h>
 
 #include <linux/netfilter_ipv4/ip_tables.h>
 
@@ -921,8 +922,10 @@ translate_table(const char *name,
 	}
 
 	/* And one copy for every other CPU */
-	for (i = 1; i < num_possible_cpus(); i++) {
-		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
+	for_each_cpu(i) {
+		if (i == 0)
+			continue;
+		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i,
 		       newinfo->entries,
 		       SMP_ALIGN(newinfo->size));
 	}
@@ -943,7 +946,7 @@ replace_table(struct ipt_table *table,
 		struct ipt_entry *table_base;
 		unsigned int i;
 
-		for (i = 0; i < num_possible_cpus(); i++) {
+		for_each_cpu(i) {
 			table_base =
 				(void *)newinfo->entries
 				+ TABLE_OFFSET(newinfo, i);
@@ -990,7 +993,7 @@ get_counters(const struct ipt_table_info *t,
 	unsigned int cpu;
 	unsigned int i;
 
-	for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+	for_each_cpu(cpu) {
 		i = 0;
 		IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
 				  t->size,
@@ -1128,7 +1131,8 @@ do_replace(void __user *user, unsigned int len)
 		return -ENOMEM;
 
 	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(tmp.size) * num_possible_cpus());
+			  + SMP_ALIGN(tmp.size) * 
+			  	(highest_possible_processor_id()+1));
 	if (!newinfo)
 		return -ENOMEM;
 
@@ -1458,7 +1462,8 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl)
 		= { 0, 0, 0, { 0 }, { 0 }, { } };
 
 	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(repl->size) * num_possible_cpus());
+			  + SMP_ALIGN(repl->size) * 
+			  		(highest_possible_processor_id()+1));
 	if (!newinfo)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 046d20b73960b7a2474b6d5e920d54c3fd7c23fe Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 13 Oct 2005 14:42:24 -0700
Subject: [TCP]: Ratelimit debugging warning.

Better safe than sorry.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f37a50e55b68..7114031fdc70 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -436,11 +436,13 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss
 	u16 flags;
 
 	if (unlikely(len >= skb->len)) {
-		printk(KERN_DEBUG "TCP: seg_size=%u, mss=%u, seq=%u, "
-		       "end_seq=%u, skb->len=%u.\n", len, mss_now,
-		       TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
-		       skb->len);
-		WARN_ON(1);
+		if (net_ratelimit()) {
+			printk(KERN_DEBUG "TCP: seg_size=%u, mss=%u, seq=%u, "
+			       "end_seq=%u, skb->len=%u.\n", len, mss_now,
+			       TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
+			       skb->len);
+			WARN_ON(1);
+		}
 		return 0;
 	}
 
-- 
cgit v1.2.3