summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/ip_vs.h2
-rw-r--r--include/net/ip_vs.h44
-rw-r--r--net/netfilter/ipvs/Kconfig13
-rw-r--r--net/netfilter/ipvs/Makefile5
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c13
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c46
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c12
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c146
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c292
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c98
10 files changed, 475 insertions, 196 deletions
diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index 003d75f6ffe1..df7728613720 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -90,10 +90,12 @@
#define IP_VS_CONN_F_ONE_PACKET 0x2000 /* forward only one packet */
/* Flags that are not sent to backup server start from bit 16 */
+#define IP_VS_CONN_F_NFCT (1 << 16) /* use netfilter conntrack */
/* Connection flags from destination that can be changed by user space */
#define IP_VS_CONN_F_DEST_MASK (IP_VS_CONN_F_FWD_MASK | \
IP_VS_CONN_F_ONE_PACKET | \
+ IP_VS_CONN_F_NFCT | \
0)
#define IP_VS_SCHEDNAME_MAXLEN 16
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index 62698a9c1631..e8ec5231eae9 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -25,7 +25,9 @@
#include <linux/ip.h>
#include <linux/ipv6.h> /* for struct ipv6hdr */
#include <net/ipv6.h> /* for ipv6_addr_copy */
-
+#ifdef CONFIG_IP_VS_NFCT
+#include <net/netfilter/nf_conntrack.h>
+#endif
/* Connections' size value needed by ip_vs_ctl.c */
extern int ip_vs_conn_tab_size;
@@ -798,6 +800,7 @@ extern int sysctl_ip_vs_expire_nodest_conn;
extern int sysctl_ip_vs_expire_quiescent_template;
extern int sysctl_ip_vs_sync_threshold[2];
extern int sysctl_ip_vs_nat_icmp_send;
+extern int sysctl_ip_vs_conntrack;
extern struct ip_vs_stats ip_vs_stats;
extern const struct ctl_path net_vs_ctl_path[];
@@ -955,8 +958,47 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum)
return csum_partial(diff, sizeof(diff), oldsum);
}
+#ifdef CONFIG_IP_VS_NFCT
+/*
+ * Netfilter connection tracking
+ * (from ip_vs_nfct.c)
+ */
+static inline int ip_vs_conntrack_enabled(void)
+{
+ return sysctl_ip_vs_conntrack;
+}
+
extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp,
int outin);
+extern int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp);
+extern void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+ struct ip_vs_conn *cp, u_int8_t proto,
+ const __be16 port, int from_rs);
+extern void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp);
+
+#else
+
+static inline int ip_vs_conntrack_enabled(void)
+{
+ return 0;
+}
+
+static inline void ip_vs_update_conntrack(struct sk_buff *skb,
+ struct ip_vs_conn *cp, int outin)
+{
+}
+
+static inline int ip_vs_confirm_conntrack(struct sk_buff *skb,
+ struct ip_vs_conn *cp)
+{
+ return NF_ACCEPT;
+}
+
+static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+}
+/* CONFIG_IP_VS_NFCT */
+#endif
#endif /* __KERNEL__ */
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index 46a77d5c3887..af3c9f48f2d7 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -3,7 +3,7 @@
#
menuconfig IP_VS
tristate "IP virtual server support"
- depends on NET && INET && NETFILTER && NF_CONNTRACK
+ depends on NET && INET && NETFILTER
---help---
IP Virtual Server support will let you build a high-performance
virtual server based on cluster of two or more real servers. This
@@ -235,7 +235,8 @@ comment 'IPVS application helper'
config IP_VS_FTP
tristate "FTP protocol helper"
- depends on IP_VS_PROTO_TCP && NF_NAT
+ depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT
+ select IP_VS_NFCT
---help---
FTP is a protocol that transfers IP address and/or port number in
the payload. In the virtual server via Network Address Translation,
@@ -247,4 +248,12 @@ config IP_VS_FTP
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_NFCT
+ bool "Netfilter connection tracking"
+ depends on NF_CONNTRACK
+ ---help---
+ The Netfilter connection tracking support allows the IPVS
+ connection state to be exported to the Netfilter framework
+ for filtering purposes.
+
endif # IP_VS
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index e3baefd7066e..349fe8819b89 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o
ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o
+ip_vs-extra_objs-y :=
+ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o
+
ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \
ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \
ip_vs_est.o ip_vs_proto.o \
- $(ip_vs_proto-objs-y)
+ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y)
# IPVS core
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 9fe1da7bcf16..a970d9691496 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -721,6 +721,9 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->control)
ip_vs_control_del(cp);
+ if (cp->flags & IP_VS_CONN_F_NFCT)
+ ip_vs_conn_drop_conntrack(cp);
+
if (unlikely(cp->app != NULL))
ip_vs_unbind_app(cp);
ip_vs_unbind_dest(cp);
@@ -816,6 +819,16 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport,
if (unlikely(pp && atomic_read(&pp->appcnt)))
ip_vs_bind_app(cp, pp);
+ /*
+ * Allow conntrack to be preserved. By default, conntrack
+ * is created and destroyed for every packet.
+ * Sometimes keeping conntrack can be useful for
+ * IP_VS_CONN_F_ONE_PACKET too.
+ */
+
+ if (ip_vs_conntrack_enabled())
+ cp->flags |= IP_VS_CONN_F_NFCT;
+
/* Hash it in the ip_vs_conn_tab finally */
ip_vs_conn_hash(cp);
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 319991d4d251..7fbc80d81fe8 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -537,6 +537,23 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
return NF_DROP;
}
+/*
+ * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
+ * chain and is used to avoid double NAT and confirmation when we do
+ * not want to keep the conntrack structure
+ */
+static unsigned int ip_vs_post_routing(unsigned int hooknum,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ if (!skb->ipvs_property)
+ return NF_ACCEPT;
+ /* The packet was sent from IPVS, exit this chain */
+ return NF_STOP;
+}
+
__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
@@ -695,7 +712,10 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
- skb->ipvs_property = 1;
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ skb->ipvs_property = 1;
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
verdict = NF_ACCEPT;
out:
@@ -928,17 +948,19 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
- ip_vs_update_conntrack(skb, cp, 0);
+ if (!(cp->flags & IP_VS_CONN_F_NFCT))
+ skb->ipvs_property = 1;
+ else
+ ip_vs_update_conntrack(skb, cp, 0);
ip_vs_conn_put(cp);
- skb->ipvs_property = 1;
-
LeaveFunction(11);
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
kfree_skb(skb);
+ LeaveFunction(11);
return NF_STOLEN;
}
@@ -1483,6 +1505,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
+ /* Before the netfilter connection tracking, exit from POST_ROUTING */
+ {
+ .hook = ip_vs_post_routing,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC-1,
+ },
#ifdef CONFIG_IP_VS_IPV6
/* After packet filtering, forward packet through VS/DR, VS/TUN,
* or VS/NAT(change destination), so that filtering rules can be
@@ -1511,6 +1541,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
.hooknum = NF_INET_FORWARD,
.priority = 99,
},
+ /* Before the netfilter connection tracking, exit from POST_ROUTING */
+ {
+ .hook = ip_vs_post_routing,
+ .owner = THIS_MODULE,
+ .pf = PF_INET6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_NAT_SRC-1,
+ },
#endif
};
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 7bd41d28080c..d2d842f292c6 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -88,6 +88,9 @@ int sysctl_ip_vs_expire_nodest_conn = 0;
int sysctl_ip_vs_expire_quiescent_template = 0;
int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
int sysctl_ip_vs_nat_icmp_send = 0;
+#ifdef CONFIG_IP_VS_NFCT
+int sysctl_ip_vs_conntrack;
+#endif
#ifdef CONFIG_IP_VS_DEBUG
@@ -1580,6 +1583,15 @@ static struct ctl_table vs_vars[] = {
.mode = 0644,
.proc_handler = proc_do_defense_mode,
},
+#ifdef CONFIG_IP_VS_NFCT
+ {
+ .procname = "conntrack",
+ .data = &sysctl_ip_vs_conntrack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
+#endif
{
.procname = "secure_tcp",
.data = &sysctl_ip_vs_secure_tcp,
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 7e9af5b76d9e..9cd375f94d61 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -20,17 +20,6 @@
*
* Author: Wouter Gadeyne
*
- *
- * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from
- * http://www.ssi.bg/~ja/nfct/:
- *
- * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
- *
- * Portions Copyright (C) 2001-2002
- * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
- *
- * Portions Copyright (C) 2003-2008
- * Julian Anastasov
*/
#define KMSG_COMPONENT "IPVS"
@@ -58,16 +47,6 @@
#define SERVER_STRING "227 Entering Passive Mode ("
#define CLIENT_STRING "PORT "
-#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
- &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
- (T)->dst.protonum
-
-#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
- &((C)->vaddr.ip), ntohs((C)->vport), \
- &((C)->daddr.ip), ntohs((C)->dport), \
- (C)->protocol, (C)->state
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -85,6 +64,8 @@ static int ip_vs_ftp_pasv;
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
{
+ /* We use connection tracking for the command connection */
+ cp->flags |= IP_VS_CONN_F_NFCT;
return 0;
}
@@ -149,120 +130,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
}
/*
- * Called from init_conntrack() as expectfn handler.
- */
-static void
-ip_vs_expect_callback(struct nf_conn *ct,
- struct nf_conntrack_expect *exp)
-{
- struct nf_conntrack_tuple *orig, new_reply;
- struct ip_vs_conn *cp;
-
- if (exp->tuple.src.l3num != PF_INET)
- return;
-
- /*
- * We assume that no NF locks are held before this callback.
- * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
- * expectations even if they use wildcard values, now we provide the
- * actual values from the newly created original conntrack direction.
- * The conntrack is confirmed when packet reaches IPVS hooks.
- */
-
- /* RS->CLIENT */
- orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
- cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
- &orig->src.u3, orig->src.u.tcp.port,
- &orig->dst.u3, orig->dst.u.tcp.port);
- if (cp) {
- /* Change reply CLIENT->RS to CLIENT->VS */
- new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
- new_reply.dst.u3 = cp->vaddr;
- new_reply.dst.u.tcp.port = cp->vport;
- IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
- ", inout cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
- goto alter;
- }
-
- /* CLIENT->VS */
- cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
- &orig->src.u3, orig->src.u.tcp.port,
- &orig->dst.u3, orig->dst.u.tcp.port);
- if (cp) {
- /* Change reply VS->CLIENT to RS->CLIENT */
- new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
- new_reply.src.u3 = cp->daddr;
- new_reply.src.u.tcp.port = cp->dport;
- IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", outin cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
- goto alter;
- }
-
- IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE
- " - unknown expect\n",
- __func__, ct, ct->status, ARG_TUPLE(orig));
- return;
-
-alter:
- /* Never alter conntrack for non-NAT conns */
- if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
- nf_conntrack_alter_reply(ct, &new_reply);
- ip_vs_conn_put(cp);
- return;
-}
-
-/*
- * Create NF conntrack expectation with wildcard (optional) source port.
- * Then the default callback function will alter the reply and will confirm
- * the conntrack entry when the first packet comes.
- */
-static void
-ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct,
- struct ip_vs_conn *cp, u_int8_t proto,
- const __be16 *port, int from_rs)
-{
- struct nf_conntrack_expect *exp;
-
- BUG_ON(!ct || ct == &nf_conntrack_untracked);
-
- exp = nf_ct_expect_alloc(ct);
- if (!exp)
- return;
-
- if (from_rs)
- nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
- nf_ct_l3num(ct), &cp->daddr, &cp->caddr,
- proto, port, &cp->cport);
- else
- nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT,
- nf_ct_l3num(ct), &cp->caddr, &cp->vaddr,
- proto, port, &cp->vport);
-
- exp->expectfn = ip_vs_expect_callback;
-
- IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&exp->tuple));
- nf_ct_expect_related(exp);
- nf_ct_expect_put(exp);
-}
-
-/*
* Look at outgoing ftp packets to catch the response to a PASV command
* from the server (inside-to-outside).
* When we see one, we build a connection entry with the client address,
@@ -335,7 +202,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
&cp->caddr, 0,
&cp->vaddr, port,
&from, port,
- IP_VS_CONN_F_NO_CPORT,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
cp->dest);
if (!n_cp)
return 0;
@@ -371,8 +239,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
start-data, end-start,
buf, buf_len);
if (ret)
- ip_vs_expect_related(skb, ct, n_cp,
- IPPROTO_TCP, NULL, 0);
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ IPPROTO_TCP, 0, 0);
}
/*
@@ -487,7 +355,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
&to, port,
&cp->vaddr, htons(ntohs(cp->vport)-1),
&cp->daddr, htons(ntohs(cp->dport)-1),
- 0,
+ IP_VS_CONN_F_NFCT,
cp->dest);
if (!n_cp)
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
new file mode 100644
index 000000000000..c038458d0290
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -0,0 +1,292 @@
+/*
+ * ip_vs_nfct.c: Netfilter connection tracking support for IPVS
+ *
+ * Portions Copyright (C) 2001-2002
+ * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland.
+ *
+ * Portions Copyright (C) 2003-2010
+ * Julian Anastasov
+ *
+ *
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * Authors:
+ * Ben North <ben@redfrontdoor.org>
+ * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels
+ * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match
+ *
+ *
+ * Current status:
+ *
+ * - provide conntrack confirmation for new and related connections, by
+ * this way we can see their proper conntrack state in all hooks
+ * - support for all forwarding methods, not only NAT
+ * - FTP support (NAT), ability to support other NAT apps with expectations
+ * - to correctly create expectations for related NAT connections the proper
+ * NF conntrack support must be already installed, eg. ip_vs_ftp requires
+ * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables
+ * NAT rules are needed)
+ * - alter reply for NAT when forwarding packet in original direction:
+ * conntrack from client in NEW or RELATED (Passive FTP DATA) state or
+ * when RELATED conntrack is created from real server (Active FTP DATA)
+ * - if iptables_nat is not loaded the Passive FTP will not work (the
+ * PASV response can not be NAT-ed) but Active FTP should work
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/compiler.h>
+#include <linux/vmalloc.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <net/ip_vs.h>
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+
+
+#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
+#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
+ &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+ (T)->dst.protonum
+
+#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
+#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
+ &((C)->vaddr.ip), ntohs((C)->vport), \
+ &((C)->daddr.ip), ntohs((C)->dport), \
+ (C)->protocol, (C)->state
+
+void
+ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo);
+ struct nf_conntrack_tuple new_tuple;
+
+ if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) ||
+ nf_ct_is_dying(ct))
+ return;
+
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
+ return;
+
+ /* Alter reply only in original direction */
+ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
+ return;
+
+ /*
+ * The connection is not yet in the hashtable, so we update it.
+ * CIP->VIP will remain the same, so leave the tuple in
+ * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
+ * real-server we will see RIP->DIP.
+ */
+ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ /*
+ * This will also take care of UDP and other protocols.
+ */
+ if (outin) {
+ new_tuple.src.u3 = cp->daddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.src.u.tcp.port = cp->dport;
+ } else {
+ new_tuple.dst.u3 = cp->vaddr;
+ if (new_tuple.dst.protonum != IPPROTO_ICMP &&
+ new_tuple.dst.protonum != IPPROTO_ICMPV6)
+ new_tuple.dst.u.tcp.port = cp->vport;
+ }
+ IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE
+ ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
+ ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ nf_conntrack_alter_reply(ct, &new_tuple);
+}
+
+int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp)
+{
+ return nf_conntrack_confirm(skb);
+}
+
+/*
+ * Called from init_conntrack() as expectfn handler.
+ */
+static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
+ struct nf_conntrack_expect *exp)
+{
+ struct nf_conntrack_tuple *orig, new_reply;
+ struct ip_vs_conn *cp;
+
+ if (exp->tuple.src.l3num != PF_INET)
+ return;
+
+ /*
+ * We assume that no NF locks are held before this callback.
+ * ip_vs_conn_out_get and ip_vs_conn_in_get should match their
+ * expectations even if they use wildcard values, now we provide the
+ * actual values from the newly created original conntrack direction.
+ * The conntrack is confirmed when packet reaches IPVS hooks.
+ */
+
+ /* RS->CLIENT */
+ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum,
+ &orig->src.u3, orig->src.u.tcp.port,
+ &orig->dst.u3, orig->dst.u.tcp.port);
+ if (cp) {
+ /* Change reply CLIENT->RS to CLIENT->VS */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.dst.u3 = cp->vaddr;
+ new_reply.dst.u.tcp.port = cp->vport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
+ ", inout cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ /* CLIENT->VS */
+ cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum,
+ &orig->src.u3, orig->src.u.tcp.port,
+ &orig->dst.u3, orig->dst.u.tcp.port);
+ if (cp) {
+ /* Change reply VS->CLIENT to RS->CLIENT */
+ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
+ __func__, ct, ct->status,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ new_reply.src.u3 = cp->daddr;
+ new_reply.src.u.tcp.port = cp->dport;
+ IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
+ FMT_TUPLE ", outin cp=" FMT_CONN "\n",
+ __func__, ct,
+ ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
+ ARG_CONN(cp));
+ goto alter;
+ }
+
+ IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
+ return;
+
+alter:
+ /* Never alter conntrack for non-NAT conns */
+ if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ)
+ nf_conntrack_alter_reply(ct, &new_reply);
+ ip_vs_conn_put(cp);
+ return;
+}
+
+/*
+ * Create NF conntrack expectation with wildcard (optional) source port.
+ * Then the default callback function will alter the reply and will confirm
+ * the conntrack entry when the first packet comes.
+ * Use port 0 to expect connection from any port.
+ */
+void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
+ struct ip_vs_conn *cp, u_int8_t proto,
+ const __be16 port, int from_rs)
+{
+ struct nf_conntrack_expect *exp;
+
+ if (ct == NULL || nf_ct_is_untracked(ct))
+ return;
+
+ exp = nf_ct_expect_alloc(ct);
+ if (!exp)
+ return;
+
+ nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
+ from_rs ? &cp->daddr : &cp->caddr,
+ from_rs ? &cp->caddr : &cp->vaddr,
+ proto, port ? &port : NULL,
+ from_rs ? &cp->cport : &cp->vport);
+
+ exp->expectfn = ip_vs_nfct_expect_callback;
+
+ IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
+ nf_ct_expect_related(exp);
+ nf_ct_expect_put(exp);
+}
+EXPORT_SYMBOL(ip_vs_nfct_expect_related);
+
+/*
+ * Our connection was terminated, try to drop the conntrack immediately
+ */
+void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct nf_conntrack_tuple tuple;
+
+ if (!cp->cport)
+ return;
+
+ tuple = (struct nf_conntrack_tuple) {
+ .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } };
+ tuple.src.u3 = cp->caddr;
+ tuple.src.u.all = cp->cport;
+ tuple.src.l3num = cp->af;
+ tuple.dst.u3 = cp->vaddr;
+ tuple.dst.u.all = cp->vport;
+
+ IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
+ " for conn " FMT_CONN "\n",
+ __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+
+ h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple);
+ if (h) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* Show what happens instead of calling nf_ct_kill() */
+ if (del_timer(&ct->timeout)) {
+ IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ if (ct->timeout.function)
+ ct->timeout.function(ct->timeout.data);
+ } else {
+ IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
+ }
+ nf_ct_put(ct);
+ } else {
+ IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
+ }
+}
+
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 49df6bea6a2d..8817afa34e6a 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -28,7 +28,6 @@
#include <net/ip6_route.h>
#include <linux/icmpv6.h>
#include <linux/netfilter.h>
-#include <net/netfilter/nf_conntrack.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
@@ -194,12 +193,37 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
dst_release(old_dst);
}
-#define IP_VS_XMIT(pf, skb, rt) \
+#define IP_VS_XMIT_TUNNEL(skb, cp) \
+({ \
+ int __ret = NF_ACCEPT; \
+ \
+ if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \
+ __ret = ip_vs_confirm_conntrack(skb, cp); \
+ if (__ret == NF_ACCEPT) { \
+ nf_reset(skb); \
+ (skb)->ip_summed = CHECKSUM_NONE; \
+ } \
+ __ret; \
+})
+
+#define IP_VS_XMIT_NAT(pf, skb, cp) \
do { \
- (skb)->ipvs_property = 1; \
+ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+ (skb)->ipvs_property = 1; \
+ else \
+ ip_vs_update_conntrack(skb, cp, 1); \
skb_forward_csum(skb); \
NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
- (rt)->dst.dev, dst_output); \
+ skb_dst(skb)->dev, dst_output); \
+} while (0)
+
+#define IP_VS_XMIT(pf, skb, cp) \
+do { \
+ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \
+ (skb)->ipvs_property = 1; \
+ skb_forward_csum(skb); \
+ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \
+ skb_dst(skb)->dev, dst_output); \
} while (0)
@@ -271,7 +295,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
LeaveFunction(10);
return NF_STOLEN;
@@ -335,7 +359,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
LeaveFunction(10);
return NF_STOLEN;
@@ -349,36 +373,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
}
#endif
-void
-ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
-{
- struct nf_conn *ct = (struct nf_conn *)skb->nfct;
- struct nf_conntrack_tuple new_tuple;
-
- if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct))
- return;
-
- /*
- * The connection is not yet in the hashtable, so we update it.
- * CIP->VIP will remain the same, so leave the tuple in
- * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the
- * real-server we will see RIP->DIP.
- */
- new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- if (outin)
- new_tuple.src.u3 = cp->daddr;
- else
- new_tuple.dst.u3 = cp->vaddr;
- /*
- * This will also take care of UDP and other protocols.
- */
- if (outin)
- new_tuple.src.u.tcp.port = cp->dport;
- else
- new_tuple.dst.u.tcp.port = cp->vport;
- nf_conntrack_alter_reply(ct, &new_tuple);
-}
-
/*
* NAT transmitter (only for outside-to-inside nat forwarding)
* Not used for related ICMP
@@ -434,8 +428,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
- ip_vs_update_conntrack(skb, cp, 1);
-
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */
@@ -443,7 +435,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp);
LeaveFunction(10);
return NF_STOLEN;
@@ -451,8 +443,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
- LeaveFunction(10);
kfree_skb(skb);
+ LeaveFunction(10);
return NF_STOLEN;
tx_error_put:
ip_rt_put(rt);
@@ -512,8 +504,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
- ip_vs_update_conntrack(skb, cp, 1);
-
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */
@@ -521,7 +511,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp);
LeaveFunction(10);
return NF_STOLEN;
@@ -571,6 +561,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int mtu;
+ int ret;
EnterFunction(10);
@@ -655,7 +646,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ip_local_out(skb);
+ ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
LeaveFunction(10);
@@ -681,6 +676,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ipv6hdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
int mtu;
+ int ret;
EnterFunction(10);
@@ -761,7 +757,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- ip6_local_out(skb);
+ ret = IP_VS_XMIT_TUNNEL(skb, cp);
+ if (ret == NF_ACCEPT)
+ ip6_local_out(skb);
+ else if (ret == NF_DROP)
+ kfree_skb(skb);
LeaveFunction(10);
@@ -820,7 +820,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
LeaveFunction(10);
return NF_STOLEN;
@@ -873,7 +873,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
LeaveFunction(10);
return NF_STOLEN;
@@ -947,7 +947,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV4, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV4, skb, cp);
rc = NF_STOLEN;
goto out;
@@ -1022,7 +1022,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
- IP_VS_XMIT(NFPROTO_IPV6, skb, rt);
+ IP_VS_XMIT(NFPROTO_IPV6, skb, cp);
rc = NF_STOLEN;
goto out;