diff options
-rw-r--r-- | include/linux/ip_vs.h | 2 | ||||
-rw-r--r-- | include/net/ip_vs.h | 44 | ||||
-rw-r--r-- | net/netfilter/ipvs/Kconfig | 13 | ||||
-rw-r--r-- | net/netfilter/ipvs/Makefile | 5 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_conn.c | 13 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_core.c | 46 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ctl.c | 12 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_ftp.c | 146 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_nfct.c | 292 | ||||
-rw-r--r-- | net/netfilter/ipvs/ip_vs_xmit.c | 98 |
10 files changed, 475 insertions, 196 deletions
diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h index 003d75f6ffe1..df7728613720 100644 --- a/include/linux/ip_vs.h +++ b/include/linux/ip_vs.h @@ -90,10 +90,12 @@ #define IP_VS_CONN_F_ONE_PACKET 0x2000 /* forward only one packet */ /* Flags that are not sent to backup server start from bit 16 */ +#define IP_VS_CONN_F_NFCT (1 << 16) /* use netfilter conntrack */ /* Connection flags from destination that can be changed by user space */ #define IP_VS_CONN_F_DEST_MASK (IP_VS_CONN_F_FWD_MASK | \ IP_VS_CONN_F_ONE_PACKET | \ + IP_VS_CONN_F_NFCT | \ 0) #define IP_VS_SCHEDNAME_MAXLEN 16 diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 62698a9c1631..e8ec5231eae9 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -25,7 +25,9 @@ #include <linux/ip.h> #include <linux/ipv6.h> /* for struct ipv6hdr */ #include <net/ipv6.h> /* for ipv6_addr_copy */ - +#ifdef CONFIG_IP_VS_NFCT +#include <net/netfilter/nf_conntrack.h> +#endif /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; @@ -798,6 +800,7 @@ extern int sysctl_ip_vs_expire_nodest_conn; extern int sysctl_ip_vs_expire_quiescent_template; extern int sysctl_ip_vs_sync_threshold[2]; extern int sysctl_ip_vs_nat_icmp_send; +extern int sysctl_ip_vs_conntrack; extern struct ip_vs_stats ip_vs_stats; extern const struct ctl_path net_vs_ctl_path[]; @@ -955,8 +958,47 @@ static inline __wsum ip_vs_check_diff2(__be16 old, __be16 new, __wsum oldsum) return csum_partial(diff, sizeof(diff), oldsum); } +#ifdef CONFIG_IP_VS_NFCT +/* + * Netfilter connection tracking + * (from ip_vs_nfct.c) + */ +static inline int ip_vs_conntrack_enabled(void) +{ + return sysctl_ip_vs_conntrack; +} + extern void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin); +extern int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp); +extern void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs); +extern void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp); + +#else + +static inline int ip_vs_conntrack_enabled(void) +{ + return 0; +} + +static inline void ip_vs_update_conntrack(struct sk_buff *skb, + struct ip_vs_conn *cp, int outin) +{ +} + +static inline int ip_vs_confirm_conntrack(struct sk_buff *skb, + struct ip_vs_conn *cp) +{ + return NF_ACCEPT; +} + +static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ +} +/* CONFIG_IP_VS_NFCT */ +#endif #endif /* __KERNEL__ */ diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 46a77d5c3887..af3c9f48f2d7 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -3,7 +3,7 @@ # menuconfig IP_VS tristate "IP virtual server support" - depends on NET && INET && NETFILTER && NF_CONNTRACK + depends on NET && INET && NETFILTER ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This @@ -235,7 +235,8 @@ comment 'IPVS application helper' config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP && NF_NAT + depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT + select IP_VS_NFCT ---help--- FTP is a protocol that transfers IP address and/or port number in the payload. In the virtual server via Network Address Translation, @@ -247,4 +248,12 @@ config IP_VS_FTP If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_NFCT + bool "Netfilter connection tracking" + depends on NF_CONNTRACK + ---help--- + The Netfilter connection tracking support allows the IPVS + connection state to be exported to the Netfilter framework + for filtering purposes. + endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index e3baefd7066e..349fe8819b89 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o +ip_vs-extra_objs-y := +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o + ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) # IPVS core diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 9fe1da7bcf16..a970d9691496 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -721,6 +721,9 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); + if (cp->flags & IP_VS_CONN_F_NFCT) + ip_vs_conn_drop_conntrack(cp); + if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); @@ -816,6 +819,16 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, if (unlikely(pp && atomic_read(&pp->appcnt))) ip_vs_bind_app(cp, pp); + /* + * Allow conntrack to be preserved. By default, conntrack + * is created and destroyed for every packet. + * Sometimes keeping conntrack can be useful for + * IP_VS_CONN_F_ONE_PACKET too. + */ + + if (ip_vs_conntrack_enabled()) + cp->flags |= IP_VS_CONN_F_NFCT; + /* Hash it in the ip_vs_conn_tab finally */ ip_vs_conn_hash(cp); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 319991d4d251..7fbc80d81fe8 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -537,6 +537,23 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING + * chain and is used to avoid double NAT and confirmation when we do + * not want to keep the conntrack structure + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!skb->ipvs_property) + return NF_ACCEPT; + /* The packet was sent from IPVS, exit this chain */ + return NF_STOP; +} + __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -695,7 +712,10 @@ static int handle_response_icmp(int af, struct sk_buff *skb, /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); - skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + skb->ipvs_property = 1; + else + ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; out: @@ -928,17 +948,19 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_update_conntrack(skb, cp, 0); + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + skb->ipvs_property = 1; + else + ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); - skb->ipvs_property = 1; - LeaveFunction(11); return NF_ACCEPT; drop: ip_vs_conn_put(cp); kfree_skb(skb); + LeaveFunction(11); return NF_STOLEN; } @@ -1483,6 +1505,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, + }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1511,6 +1541,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC-1, + }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7bd41d28080c..d2d842f292c6 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -88,6 +88,9 @@ int sysctl_ip_vs_expire_nodest_conn = 0; int sysctl_ip_vs_expire_quiescent_template = 0; int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; int sysctl_ip_vs_nat_icmp_send = 0; +#ifdef CONFIG_IP_VS_NFCT +int sysctl_ip_vs_conntrack; +#endif #ifdef CONFIG_IP_VS_DEBUG @@ -1580,6 +1583,15 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_do_defense_mode, }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .data = &sysctl_ip_vs_conntrack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .procname = "secure_tcp", .data = &sysctl_ip_vs_secure_tcp, diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 7e9af5b76d9e..9cd375f94d61 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -20,17 +20,6 @@ * * Author: Wouter Gadeyne * - * - * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from - * http://www.ssi.bg/~ja/nfct/: - * - * ip_vs_nfct.c: Netfilter connection tracking support for IPVS - * - * Portions Copyright (C) 2001-2002 - * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. - * - * Portions Copyright (C) 2003-2008 - * Julian Anastasov */ #define KMSG_COMPONENT "IPVS" @@ -58,16 +47,6 @@ #define SERVER_STRING "227 Entering Passive Mode (" #define CLIENT_STRING "PORT " -#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" -#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ - &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ - (T)->dst.protonum - -#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" -#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ - &((C)->vaddr.ip), ntohs((C)->vport), \ - &((C)->daddr.ip), ntohs((C)->dport), \ - (C)->protocol, (C)->state /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper @@ -85,6 +64,8 @@ static int ip_vs_ftp_pasv; static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { + /* We use connection tracking for the command connection */ + cp->flags |= IP_VS_CONN_F_NFCT; return 0; } @@ -149,120 +130,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, } /* - * Called from init_conntrack() as expectfn handler. - */ -static void -ip_vs_expect_callback(struct nf_conn *ct, - struct nf_conntrack_expect *exp) -{ - struct nf_conntrack_tuple *orig, new_reply; - struct ip_vs_conn *cp; - - if (exp->tuple.src.l3num != PF_INET) - return; - - /* - * We assume that no NF locks are held before this callback. - * ip_vs_conn_out_get and ip_vs_conn_in_get should match their - * expectations even if they use wildcard values, now we provide the - * actual values from the newly created original conntrack direction. - * The conntrack is confirmed when packet reaches IPVS hooks. - */ - - /* RS->CLIENT */ - orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); - if (cp) { - /* Change reply CLIENT->RS to CLIENT->VS */ - new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found inout cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - new_reply.dst.u3 = cp->vaddr; - new_reply.dst.u.tcp.port = cp->vport; - IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE - ", inout cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - goto alter; - } - - /* CLIENT->VS */ - cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); - if (cp) { - /* Change reply VS->CLIENT to RS->CLIENT */ - new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found outin cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - new_reply.src.u3 = cp->daddr; - new_reply.src.u.tcp.port = cp->dport; - IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " - FMT_TUPLE ", outin cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - goto alter; - } - - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE - " - unknown expect\n", - __func__, ct, ct->status, ARG_TUPLE(orig)); - return; - -alter: - /* Never alter conntrack for non-NAT conns */ - if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) - nf_conntrack_alter_reply(ct, &new_reply); - ip_vs_conn_put(cp); - return; -} - -/* - * Create NF conntrack expectation with wildcard (optional) source port. - * Then the default callback function will alter the reply and will confirm - * the conntrack entry when the first packet comes. - */ -static void -ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct, - struct ip_vs_conn *cp, u_int8_t proto, - const __be16 *port, int from_rs) -{ - struct nf_conntrack_expect *exp; - - BUG_ON(!ct || ct == &nf_conntrack_untracked); - - exp = nf_ct_expect_alloc(ct); - if (!exp) - return; - - if (from_rs) - nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, - nf_ct_l3num(ct), &cp->daddr, &cp->caddr, - proto, port, &cp->cport); - else - nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, - nf_ct_l3num(ct), &cp->caddr, &cp->vaddr, - proto, port, &cp->vport); - - exp->expectfn = ip_vs_expect_callback; - - IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&exp->tuple)); - nf_ct_expect_related(exp); - nf_ct_expect_put(exp); -} - -/* * Look at outgoing ftp packets to catch the response to a PASV command * from the server (inside-to-outside). * When we see one, we build a connection entry with the client address, @@ -335,7 +202,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, &cp->caddr, 0, &cp->vaddr, port, &from, port, - IP_VS_CONN_F_NO_CPORT, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, cp->dest); if (!n_cp) return 0; @@ -371,8 +239,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, start-data, end-start, buf, buf_len); if (ret) - ip_vs_expect_related(skb, ct, n_cp, - IPPROTO_TCP, NULL, 0); + ip_vs_nfct_expect_related(skb, ct, n_cp, + IPPROTO_TCP, 0, 0); } /* @@ -487,7 +355,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &cp->daddr, htons(ntohs(cp->dport)-1), - 0, + IP_VS_CONN_F_NFCT, cp->dest); if (!n_cp) return 0; diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c new file mode 100644 index 000000000000..c038458d0290 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -0,0 +1,292 @@ +/* + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2010 + * Julian Anastasov + * + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Authors: + * Ben North <ben@redfrontdoor.org> + * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels + * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match + * + * + * Current status: + * + * - provide conntrack confirmation for new and related connections, by + * this way we can see their proper conntrack state in all hooks + * - support for all forwarding methods, not only NAT + * - FTP support (NAT), ability to support other NAT apps with expectations + * - to correctly create expectations for related NAT connections the proper + * NF conntrack support must be already installed, eg. ip_vs_ftp requires + * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables + * NAT rules are needed) + * - alter reply for NAT when forwarding packet in original direction: + * conntrack from client in NEW or RELATED (Passive FTP DATA) state or + * when RELATED conntrack is created from real server (Active FTP DATA) + * - if iptables_nat is not loaded the Passive FTP will not work (the + * PASV response can not be NAT-ed) but Active FTP should work + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/compiler.h> +#include <linux/vmalloc.h> +#include <linux/skbuff.h> +#include <net/ip.h> +#include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> +#include <net/ip_vs.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_zones.h> + + +#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" +#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ + &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" +#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ + &((C)->vaddr.ip), ntohs((C)->vport), \ + &((C)->daddr.ip), ntohs((C)->dport), \ + (C)->protocol, (C)->state + +void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || + nf_ct_is_dying(ct)) + return; + + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return; + + /* Alter reply only in original direction */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + /* + * This will also take care of UDP and other protocols. + */ + if (outin) { + new_tuple.src.u3 = cp->daddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.src.u.tcp.port = cp->dport; + } else { + new_tuple.dst.u3 = cp->vaddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.dst.u.tcp.port = cp->vport; + } + IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE + ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), + ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + nf_conntrack_alter_reply(ct, &new_tuple); +} + +int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + return nf_conntrack_confirm(skb); +} + +/* + * Called from init_conntrack() as expectfn handler. + */ +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + * Use port 0 to expect connection from any port. + */ +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return; + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + from_rs ? &cp->daddr : &cp->caddr, + from_rs ? &cp->caddr : &cp->vaddr, + proto, port ? &port : NULL, + from_rs ? &cp->cport : &cp->vport); + + exp->expectfn = ip_vs_nfct_expect_callback; + + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} +EXPORT_SYMBOL(ip_vs_nfct_expect_related); + +/* + * Our connection was terminated, try to drop the conntrack immediately + */ +void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conntrack_tuple tuple; + + if (!cp->cport) + return; + + tuple = (struct nf_conntrack_tuple) { + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; + tuple.src.u3 = cp->caddr; + tuple.src.u.all = cp->cport; + tuple.src.l3num = cp->af; + tuple.dst.u3 = cp->vaddr; + tuple.dst.u.all = cp->vport; + + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE + " for conn " FMT_CONN "\n", + __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + + h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* Show what happens instead of calling nf_ct_kill() */ + if (del_timer(&ct->timeout)) { + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + if (ct->timeout.function) + ct->timeout.function(ct->timeout.data); + } else { + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } + nf_ct_put(ct); + } else { + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); + } +} + diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 49df6bea6a2d..8817afa34e6a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -28,7 +28,6 @@ #include <net/ip6_route.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> -#include <net/netfilter/nf_conntrack.h> #include <linux/netfilter_ipv4.h> #include <net/ip_vs.h> @@ -194,12 +193,37 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) dst_release(old_dst); } -#define IP_VS_XMIT(pf, skb, rt) \ +#define IP_VS_XMIT_TUNNEL(skb, cp) \ +({ \ + int __ret = NF_ACCEPT; \ + \ + if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ + __ret = ip_vs_confirm_conntrack(skb, cp); \ + if (__ret == NF_ACCEPT) { \ + nf_reset(skb); \ + (skb)->ip_summed = CHECKSUM_NONE; \ + } \ + __ret; \ +}) + +#define IP_VS_XMIT_NAT(pf, skb, cp) \ do { \ - (skb)->ipvs_property = 1; \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + (skb)->ipvs_property = 1; \ + else \ + ip_vs_update_conntrack(skb, cp, 1); \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->dst.dev, dst_output); \ + skb_dst(skb)->dev, dst_output); \ +} while (0) + +#define IP_VS_XMIT(pf, skb, cp) \ +do { \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + (skb)->ipvs_property = 1; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + skb_dst(skb)->dev, dst_output); \ } while (0) @@ -271,7 +295,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -335,7 +359,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -349,36 +373,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } #endif -void -ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) -{ - struct nf_conn *ct = (struct nf_conn *)skb->nfct; - struct nf_conntrack_tuple new_tuple; - - if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct)) - return; - - /* - * The connection is not yet in the hashtable, so we update it. - * CIP->VIP will remain the same, so leave the tuple in - * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the - * real-server we will see RIP->DIP. - */ - new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - if (outin) - new_tuple.src.u3 = cp->daddr; - else - new_tuple.dst.u3 = cp->vaddr; - /* - * This will also take care of UDP and other protocols. - */ - if (outin) - new_tuple.src.u.tcp.port = cp->dport; - else - new_tuple.dst.u.tcp.port = cp->vport; - nf_conntrack_alter_reply(ct, &new_tuple); -} - /* * NAT transmitter (only for outside-to-inside nat forwarding) * Not used for related ICMP @@ -434,8 +428,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - ip_vs_update_conntrack(skb, cp, 1); - /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -443,7 +435,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -451,8 +443,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, tx_error_icmp: dst_link_failure(skb); tx_error: - LeaveFunction(10); kfree_skb(skb); + LeaveFunction(10); return NF_STOLEN; tx_error_put: ip_rt_put(rt); @@ -512,8 +504,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - ip_vs_update_conntrack(skb, cp, 1); - /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -521,7 +511,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -571,6 +561,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; + int ret; EnterFunction(10); @@ -655,7 +646,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ip_local_out(skb); + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); LeaveFunction(10); @@ -681,6 +676,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; + int ret; EnterFunction(10); @@ -761,7 +757,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ip6_local_out(skb); + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip6_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); LeaveFunction(10); @@ -820,7 +820,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -873,7 +873,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -947,7 +947,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); rc = NF_STOLEN; goto out; @@ -1022,7 +1022,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); rc = NF_STOLEN; goto out; |