diff options
Diffstat (limited to 'net/ipv4/ipvs')
26 files changed, 0 insertions, 12109 deletions
diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig deleted file mode 100644 index 09d0c3f35669..000000000000 --- a/net/ipv4/ipvs/Kconfig +++ /dev/null @@ -1,224 +0,0 @@ -# -# IP Virtual Server configuration -# -menuconfig IP_VS - tristate "IP virtual server support (EXPERIMENTAL)" - depends on NETFILTER - ---help--- - IP Virtual Server support will let you build a high-performance - virtual server based on cluster of two or more real servers. This - option must be enabled for at least one of the clustered computers - that will take care of intercepting incoming connections to a - single IP address and scheduling them to real servers. - - Three request dispatching techniques are implemented, they are - virtual server via NAT, virtual server via tunneling and virtual - server via direct routing. The several scheduling algorithms can - be used to choose which server the connection is directed to, - thus load balancing can be achieved among the servers. For more - information and its administration program, please visit the - following URL: <http://www.linuxvirtualserver.org/>. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -if IP_VS - -config IP_VS_DEBUG - bool "IP virtual server debugging" - ---help--- - Say Y here if you want to get additional messages useful in - debugging the IP virtual server code. You can change the debug - level in /proc/sys/net/ipv4/vs/debug_level - -config IP_VS_TAB_BITS - int "IPVS connection table size (the Nth power of 2)" - default "12" - ---help--- - The IPVS connection hash table uses the chaining scheme to handle - hash collisions. Using a big IPVS connection hash table will greatly - reduce conflicts when there are hundreds of thousands of connections - in the hash table. - - Note the table size must be power of 2. The table size will be the - value of 2 to the your input number power. The number to choose is - from 8 to 20, the default number is 12, which means the table size - is 4096. Don't input the number too small, otherwise you will lose - performance on it. You can adapt the table size yourself, according - to your virtual server application. It is good to set the table size - not far less than the number of connections per second multiplying - average lasting time of connection in the table. For example, your - virtual server gets 200 connections per second, the connection lasts - for 200 seconds in average in the connection table, the table size - should be not far less than 200x200, it is good to set the table - size 32768 (2**15). - - Another note that each connection occupies 128 bytes effectively and - each hash entry uses 8 bytes, so you can estimate how much memory is - needed for your box. - -comment "IPVS transport protocol load balancing support" - -config IP_VS_PROTO_TCP - bool "TCP load balancing support" - ---help--- - This option enables support for load balancing TCP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_UDP - bool "UDP load balancing support" - ---help--- - This option enables support for load balancing UDP transport - protocol. Say Y if unsure. - -config IP_VS_PROTO_ESP - bool "ESP load balancing support" - ---help--- - This option enables support for load balancing ESP (Encapsulation - Security Payload) transport protocol. Say Y if unsure. - -config IP_VS_PROTO_AH - bool "AH load balancing support" - ---help--- - This option enables support for load balancing AH (Authentication - Header) transport protocol. Say Y if unsure. - -comment "IPVS scheduler" - -config IP_VS_RR - tristate "round-robin scheduling" - ---help--- - The robin-robin scheduling algorithm simply directs network - connections to different real servers in a round-robin manner. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WRR - tristate "weighted round-robin scheduling" - ---help--- - The weighted robin-robin scheduling algorithm directs network - connections to different real servers based on server weights - in a round-robin manner. Servers with higher weights receive - new connections first than those with less weights, and servers - with higher weights get more connections than those with less - weights and servers with equal weights get equal connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LC - tristate "least-connection scheduling" - ---help--- - The least-connection scheduling algorithm directs network - connections to the server with the least number of active - connections. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_WLC - tristate "weighted least-connection scheduling" - ---help--- - The weighted least-connection scheduling algorithm directs network - connections to the server with the least active connections - normalized by the server weight. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLC - tristate "locality-based least-connection scheduling" - ---help--- - The locality-based least-connection scheduling algorithm is for - destination IP load balancing. It is usually used in cache cluster. - This algorithm usually directs packet destined for an IP address to - its server if the server is alive and under load. If the server is - overloaded (its active connection numbers is larger than its weight) - and there is a server in its half load, then allocate the weighted - least-connection server to this IP address. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_LBLCR - tristate "locality-based least-connection with replication scheduling" - ---help--- - The locality-based least-connection with replication scheduling - algorithm is also for destination IP load balancing. It is - usually used in cache cluster. It differs from the LBLC scheduling - as follows: the load balancer maintains mappings from a target - to a set of server nodes that can serve the target. Requests for - a target are assigned to the least-connection node in the target's - server set. If all the node in the server set are over loaded, - it picks up a least-connection node in the cluster and adds it - in the sever set for the target. If the server set has not been - modified for the specified time, the most loaded node is removed - from the server set, in order to avoid high degree of replication. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_DH - tristate "destination hashing scheduling" - ---help--- - The destination hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their destination IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SH - tristate "source hashing scheduling" - ---help--- - The source hashing scheduling algorithm assigns network - connections to the servers through looking up a statically assigned - hash table by their source IP addresses. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_SED - tristate "shortest expected delay scheduling" - ---help--- - The shortest expected delay scheduling algorithm assigns network - connections to the server with the shortest expected delay. The - expected delay that the job will experience is (Ci + 1) / Ui if - sent to the ith server, in which Ci is the number of connections - on the ith server and Ui is the fixed service rate (weight) - of the ith server. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -config IP_VS_NQ - tristate "never queue scheduling" - ---help--- - The never queue scheduling algorithm adopts a two-speed model. - When there is an idle server available, the job will be sent to - the idle server, instead of waiting for a fast one. When there - is no idle server available, the job will be sent to the server - that minimize its expected delay (The Shortest Expected Delay - scheduling algorithm). - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -comment 'IPVS application helper' - -config IP_VS_FTP - tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP - ---help--- - FTP is a protocol that transfers IP address and/or port number in - the payload. In the virtual server via Network Address Translation, - the IP address and port number of real servers cannot be sent to - clients in ftp connections directly, so FTP protocol helper is - required for tracking the connection and mangling it back to that of - virtual service. - - If you want to compile it in kernel, say Y. To compile it as a - module, choose M here. If unsure, say N. - -endif # IP_VS diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile deleted file mode 100644 index 30e85de9ffff..000000000000 --- a/net/ipv4/ipvs/Makefile +++ /dev/null @@ -1,34 +0,0 @@ -# -# Makefile for the IPVS modules on top of IPv4. -# - -# IPVS transport protocol load balancing support -ip_vs_proto-objs-y := -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o -ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o - -ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ - ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ - ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) - - -# IPVS core -obj-$(CONFIG_IP_VS) += ip_vs.o - -# IPVS schedulers -obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o -obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o -obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o -obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o -obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o -obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o -obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o -obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o -obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o -obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o - -# IPVS application helpers -obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c deleted file mode 100644 index 201b8ea3020d..000000000000 --- a/net/ipv4/ipvs/ip_vs_app.c +++ /dev/null @@ -1,622 +0,0 @@ -/* - * ip_vs_app.c: Application module support for IPVS - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference - * is that ip_vs_app module handles the reverse direction (incoming requests - * and outgoing responses). - * - * IP_MASQ_APP application masquerading module - * - * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <net/net_namespace.h> -#include <net/protocol.h> -#include <net/tcp.h> -#include <asm/system.h> -#include <linux/stat.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> - -#include <net/ip_vs.h> - -EXPORT_SYMBOL(register_ip_vs_app); -EXPORT_SYMBOL(unregister_ip_vs_app); -EXPORT_SYMBOL(register_ip_vs_app_inc); - -/* ipvs application list head */ -static LIST_HEAD(ip_vs_app_list); -static DEFINE_MUTEX(__ip_vs_app_mutex); - - -/* - * Get an ip_vs_app object - */ -static inline int ip_vs_app_get(struct ip_vs_app *app) -{ - return try_module_get(app->module); -} - - -static inline void ip_vs_app_put(struct ip_vs_app *app) -{ - module_put(app->module); -} - - -/* - * Allocate/initialize app incarnation and register it in proto apps. - */ -static int -ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - struct ip_vs_protocol *pp; - struct ip_vs_app *inc; - int ret; - - if (!(pp = ip_vs_proto_get(proto))) - return -EPROTONOSUPPORT; - - if (!pp->unregister_app) - return -EOPNOTSUPP; - - inc = kmemdup(app, sizeof(*inc), GFP_KERNEL); - if (!inc) - return -ENOMEM; - INIT_LIST_HEAD(&inc->p_list); - INIT_LIST_HEAD(&inc->incs_list); - inc->app = app; - inc->port = htons(port); - atomic_set(&inc->usecnt, 0); - - if (app->timeouts) { - inc->timeout_table = - ip_vs_create_timeout_table(app->timeouts, - app->timeouts_size); - if (!inc->timeout_table) { - ret = -ENOMEM; - goto out; - } - } - - ret = pp->register_app(inc); - if (ret) - goto out; - - list_add(&inc->a_list, &app->incs_list); - IP_VS_DBG(9, "%s application %s:%u registered\n", - pp->name, inc->name, inc->port); - - return 0; - - out: - kfree(inc->timeout_table); - kfree(inc); - return ret; -} - - -/* - * Release app incarnation - */ -static void -ip_vs_app_inc_release(struct ip_vs_app *inc) -{ - struct ip_vs_protocol *pp; - - if (!(pp = ip_vs_proto_get(inc->protocol))) - return; - - if (pp->unregister_app) - pp->unregister_app(inc); - - IP_VS_DBG(9, "%s App %s:%u unregistered\n", - pp->name, inc->name, inc->port); - - list_del(&inc->a_list); - - kfree(inc->timeout_table); - kfree(inc); -} - - -/* - * Get reference to app inc (only called from softirq) - * - */ -int ip_vs_app_inc_get(struct ip_vs_app *inc) -{ - int result; - - atomic_inc(&inc->usecnt); - if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) - atomic_dec(&inc->usecnt); - return result; -} - - -/* - * Put the app inc (only called from timer or net softirq) - */ -void ip_vs_app_inc_put(struct ip_vs_app *inc) -{ - ip_vs_app_put(inc->app); - atomic_dec(&inc->usecnt); -} - - -/* - * Register an application incarnation in protocol applications - */ -int -register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) -{ - int result; - - mutex_lock(&__ip_vs_app_mutex); - - result = ip_vs_app_inc_new(app, proto, port); - - mutex_unlock(&__ip_vs_app_mutex); - - return result; -} - - -/* - * ip_vs_app registration routine - */ -int register_ip_vs_app(struct ip_vs_app *app) -{ - /* increase the module use count */ - ip_vs_use_count_inc(); - - mutex_lock(&__ip_vs_app_mutex); - - list_add(&app->a_list, &ip_vs_app_list); - - mutex_unlock(&__ip_vs_app_mutex); - - return 0; -} - - -/* - * ip_vs_app unregistration routine - * We are sure there are no app incarnations attached to services - */ -void unregister_ip_vs_app(struct ip_vs_app *app) -{ - struct ip_vs_app *inc, *nxt; - - mutex_lock(&__ip_vs_app_mutex); - - list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { - ip_vs_app_inc_release(inc); - } - - list_del(&app->a_list); - - mutex_unlock(&__ip_vs_app_mutex); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - - -/* - * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) - */ -int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) -{ - return pp->app_conn_bind(cp); -} - - -/* - * Unbind cp from application incarnation (called by cp destructor) - */ -void ip_vs_unbind_app(struct ip_vs_conn *cp) -{ - struct ip_vs_app *inc = cp->app; - - if (!inc) - return; - - if (inc->unbind_conn) - inc->unbind_conn(inc, cp); - if (inc->done_conn) - inc->done_conn(inc, cp); - ip_vs_app_inc_put(inc); - cp->app = NULL; -} - - -/* - * Fixes th->seq based on ip_vs_seq info. - */ -static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 seq = ntohl(th->seq); - - /* - * Adjust seq with delta-offset for all packets after - * the most recent resized pkt seq and with previous_delta offset - * for all packets before most recent resized pkt seq. - */ - if (vseq->delta || vseq->previous_delta) { - if(after(seq, vseq->init_seq)) { - th->seq = htonl(seq + vseq->delta); - IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", - vseq->delta); - } else { - th->seq = htonl(seq + vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " - "(%d) to seq\n", vseq->previous_delta); - } - } -} - - -/* - * Fixes th->ack_seq based on ip_vs_seq info. - */ -static inline void -vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) -{ - __u32 ack_seq = ntohl(th->ack_seq); - - /* - * Adjust ack_seq with delta-offset for - * the packets AFTER most recent resized pkt has caused a shift - * for packets before most recent resized pkt, use previous_delta - */ - if (vseq->delta || vseq->previous_delta) { - /* since ack_seq is the number of octet that is expected - to receive next, so compare it with init_seq+delta */ - if(after(ack_seq, vseq->init_seq+vseq->delta)) { - th->ack_seq = htonl(ack_seq - vseq->delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " - "(%d) from ack_seq\n", vseq->delta); - - } else { - th->ack_seq = htonl(ack_seq - vseq->previous_delta); - IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " - "previous_delta (%d) from ack_seq\n", - vseq->previous_delta); - } - } -} - - -/* - * Updates ip_vs_seq if pkt has been resized - * Assumes already checked proto==IPPROTO_TCP and diff!=0. - */ -static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, - unsigned flag, __u32 seq, int diff) -{ - /* spinlock is to keep updating cp->flags atomic */ - spin_lock(&cp->lock); - if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { - vseq->previous_delta = vseq->delta; - vseq->delta += diff; - vseq->init_seq = seq; - cp->flags |= flag; - } - spin_unlock(&cp->lock); -} - -static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_seq(&cp->out_seq, th); - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_ack_seq(&cp->in_seq, th); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - if (!app->pkt_out(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->out_seq, - IP_VS_CONN_F_OUT_SEQ, seq, diff); - - return 1; -} - -/* - * Output pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL - * returns false if it can't handle packet (oom) - */ -int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_out(cp, skb, app); - - /* - * Call private output hook function - */ - if (app->pkt_out == NULL) - return 1; - - return app->pkt_out(app, cp, skb, NULL); -} - - -static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) -{ - int diff; - const unsigned int tcp_offset = ip_hdrlen(skb); - struct tcphdr *th; - __u32 seq; - - if (!skb_make_writable(skb, tcp_offset + sizeof(*th))) - return 0; - - th = (struct tcphdr *)(skb_network_header(skb) + tcp_offset); - - /* - * Remember seq number in case this pkt gets resized - */ - seq = ntohl(th->seq); - - /* - * Fix seq stuff if flagged as so. - */ - if (cp->flags & IP_VS_CONN_F_IN_SEQ) - vs_fix_seq(&cp->in_seq, th); - if (cp->flags & IP_VS_CONN_F_OUT_SEQ) - vs_fix_ack_seq(&cp->out_seq, th); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - if (!app->pkt_in(app, cp, skb, &diff)) - return 0; - - /* - * Update ip_vs seq stuff if len has changed. - */ - if (diff != 0) - vs_seq_update(cp, &cp->in_seq, - IP_VS_CONN_F_IN_SEQ, seq, diff); - - return 1; -} - -/* - * Input pkt hook. Will call bound ip_vs_app specific function - * called by ipvs packet handler, assumes previously checked cp!=NULL. - * returns false if can't handle packet (oom). - */ -int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_app *app; - - /* - * check if application module is bound to - * this ip_vs_conn. - */ - if ((app = cp->app) == NULL) - return 1; - - /* TCP is complicated */ - if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_in(cp, skb, app); - - /* - * Call private input hook function - */ - if (app->pkt_in == NULL) - return 1; - - return app->pkt_in(app, cp, skb, NULL); -} - - -#ifdef CONFIG_PROC_FS -/* - * /proc/net/ip_vs_app entry function - */ - -static struct ip_vs_app *ip_vs_app_idx(loff_t pos) -{ - struct ip_vs_app *app, *inc; - - list_for_each_entry(app, &ip_vs_app_list, a_list) { - list_for_each_entry(inc, &app->incs_list, a_list) { - if (pos-- == 0) - return inc; - } - } - return NULL; - -} - -static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) -{ - mutex_lock(&__ip_vs_app_mutex); - - return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; -} - -static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_app *inc, *app; - struct list_head *e; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_app_idx(0); - - inc = v; - app = inc->app; - - if ((e = inc->a_list.next) != &app->incs_list) - return list_entry(e, struct ip_vs_app, a_list); - - /* go on to next application */ - for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { - app = list_entry(e, struct ip_vs_app, a_list); - list_for_each_entry(inc, &app->incs_list, a_list) { - return inc; - } - } - return NULL; -} - -static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) -{ - mutex_unlock(&__ip_vs_app_mutex); -} - -static int ip_vs_app_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) - seq_puts(seq, "prot port usecnt name\n"); - else { - const struct ip_vs_app *inc = v; - - seq_printf(seq, "%-3s %-7u %-6d %-17s\n", - ip_vs_proto_name(inc->protocol), - ntohs(inc->port), - atomic_read(&inc->usecnt), - inc->name); - } - return 0; -} - -static const struct seq_operations ip_vs_app_seq_ops = { - .start = ip_vs_app_seq_start, - .next = ip_vs_app_seq_next, - .stop = ip_vs_app_seq_stop, - .show = ip_vs_app_seq_show, -}; - -static int ip_vs_app_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_app_seq_ops); -} - -static const struct file_operations ip_vs_app_fops = { - .owner = THIS_MODULE, - .open = ip_vs_app_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - - -/* - * Replace a segment of data with a new segment - */ -int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri, - char *o_buf, int o_len, char *n_buf, int n_len) -{ - int diff; - int o_offset; - int o_left; - - EnterFunction(9); - - diff = n_len - o_len; - o_offset = o_buf - (char *)skb->data; - /* The length of left data after o_buf+o_len in the skb data */ - o_left = skb->len - (o_offset + o_len); - - if (diff <= 0) { - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - skb_trim(skb, skb->len + diff); - } else if (diff <= skb_tailroom(skb)) { - skb_put(skb, diff); - memmove(o_buf + n_len, o_buf + o_len, o_left); - memcpy(o_buf, n_buf, n_len); - } else { - if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) - return -ENOMEM; - skb_put(skb, diff); - memmove(skb->data + o_offset + n_len, - skb->data + o_offset + o_len, o_left); - skb_copy_to_linear_data_offset(skb, o_offset, n_buf, n_len); - } - - /* must update the iph total length here */ - ip_hdr(skb)->tot_len = htons(skb->len); - - LeaveFunction(9); - return 0; -} - - -int __init ip_vs_app_init(void) -{ - /* we will replace it with proc_net_ipvs_create() soon */ - proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); - return 0; -} - - -void ip_vs_app_cleanup(void) -{ - proc_net_remove(&init_net, "ip_vs_app"); -} diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c deleted file mode 100644 index 44a6872dc245..000000000000 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ /dev/null @@ -1,1023 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. Many code here is taken from IP MASQ code of kernel 2.2. - * - * Changes: - * - */ - -#include <linux/interrupt.h> -#include <linux/in.h> -#include <linux/net.h> -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/vmalloc.h> -#include <linux/proc_fs.h> /* for proc_net_* */ -#include <linux/seq_file.h> -#include <linux/jhash.h> -#include <linux/random.h> - -#include <net/net_namespace.h> -#include <net/ip_vs.h> - - -/* - * Connection hash table: for input and output packets lookups of IPVS - */ -static struct list_head *ip_vs_conn_tab; - -/* SLAB cache for IPVS connections */ -static struct kmem_cache *ip_vs_conn_cachep __read_mostly; - -/* counter for current IPVS connections */ -static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); - -/* counter for no client port connections */ -static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); - -/* random value for IPVS connection hash */ -static unsigned int ip_vs_conn_rnd; - -/* - * Fine locking granularity for big connection hash table - */ -#define CT_LOCKARRAY_BITS 4 -#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) -#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) - -struct ip_vs_aligned_lock -{ - rwlock_t l; -} __attribute__((__aligned__(SMP_CACHE_BYTES))); - -/* lock array for conn table */ -static struct ip_vs_aligned_lock -__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned; - -static inline void ct_read_lock(unsigned key) -{ - read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock(unsigned key) -{ - read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock(unsigned key) -{ - write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock(unsigned key) -{ - write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_lock_bh(unsigned key) -{ - read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_read_unlock_bh(unsigned key) -{ - read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_lock_bh(unsigned key) -{ - write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - -static inline void ct_write_unlock_bh(unsigned key) -{ - write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); -} - - -/* - * Returns hash value for IPVS connection entry - */ -static unsigned int ip_vs_conn_hashkey(unsigned proto, __be32 addr, __be16 port) -{ - return jhash_3words((__force u32)addr, (__force u32)port, proto, ip_vs_conn_rnd) - & IP_VS_CONN_TAB_MASK; -} - - -/* - * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. - * returns bool success. - */ -static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - list_add(&cp->c_list, &ip_vs_conn_tab[hash]); - cp->flags |= IP_VS_CONN_F_HASHED; - atomic_inc(&cp->refcnt); - ret = 1; - } else { - IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - ret = 0; - } - - ct_write_unlock(hash); - - return ret; -} - - -/* - * UNhashes ip_vs_conn from ip_vs_conn_tab. - * returns bool success. - */ -static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) -{ - unsigned hash; - int ret; - - /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); - - ct_write_lock(hash); - - if (cp->flags & IP_VS_CONN_F_HASHED) { - list_del(&cp->c_list); - cp->flags &= ~IP_VS_CONN_F_HASHED; - atomic_dec(&cp->refcnt); - ret = 1; - } else - ret = 0; - - ct_write_unlock(hash); - - return ret; -} - - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from OUTside-to-INside. - * s_addr, s_port: pkt source address (foreign host) - * d_addr, d_port: pkt dest address (load balancer) - */ -static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr==cp->caddr && s_port==cp->cport && - d_port==cp->vport && d_addr==cp->vaddr && - ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol==cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ct_read_unlock(hash); - return cp; - } - } - - ct_read_unlock(hash); - - return NULL; -} - -struct ip_vs_conn *ip_vs_conn_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) -{ - struct ip_vs_conn *cp; - - cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); - - IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - cp?"hit":"not hit"); - - return cp; -} - -/* Get reference to connection template */ -struct ip_vs_conn *ip_vs_ct_in_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp; - - hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (s_addr==cp->caddr && s_port==cp->cport && - d_port==cp->vport && d_addr==cp->vaddr && - cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol==cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - goto out; - } - } - cp = NULL; - - out: - ct_read_unlock(hash); - - IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - cp?"hit":"not hit"); - - return cp; -} - -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from inside-to-OUTside. - * s_addr, s_port: pkt source address (inside host) - * d_addr, d_port: pkt dest address (foreign host) - */ -struct ip_vs_conn *ip_vs_conn_out_get -(int protocol, __be32 s_addr, __be16 s_port, __be32 d_addr, __be16 d_port) -{ - unsigned hash; - struct ip_vs_conn *cp, *ret=NULL; - - /* - * Check for "full" addressed entries - */ - hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); - - ct_read_lock(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (d_addr == cp->caddr && d_port == cp->cport && - s_port == cp->dport && s_addr == cp->daddr && - protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); - ret = cp; - break; - } - } - - ct_read_unlock(hash); - - IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", - ip_vs_proto_name(protocol), - NIPQUAD(s_addr), ntohs(s_port), - NIPQUAD(d_addr), ntohs(d_port), - ret?"hit":"not hit"); - - return ret; -} - - -/* - * Put back the conn and restart its timer with its timeout - */ -void ip_vs_conn_put(struct ip_vs_conn *cp) -{ - /* reset it expire in its timeout */ - mod_timer(&cp->timer, jiffies+cp->timeout); - - __ip_vs_conn_put(cp); -} - - -/* - * Fill a no_client_port connection with a client port number - */ -void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport) -{ - if (ip_vs_conn_unhash(cp)) { - spin_lock(&cp->lock); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) { - atomic_dec(&ip_vs_conn_no_cport_cnt); - cp->flags &= ~IP_VS_CONN_F_NO_CPORT; - cp->cport = cport; - } - spin_unlock(&cp->lock); - - /* hash on new dport */ - ip_vs_conn_hash(cp); - } -} - - -/* - * Bind a connection entry with the corresponding packet_xmit. - * Called by ip_vs_conn_new. - */ -static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) -{ - switch (IP_VS_FWD_METHOD(cp)) { - case IP_VS_CONN_F_MASQ: - cp->packet_xmit = ip_vs_nat_xmit; - break; - - case IP_VS_CONN_F_TUNNEL: - cp->packet_xmit = ip_vs_tunnel_xmit; - break; - - case IP_VS_CONN_F_DROUTE: - cp->packet_xmit = ip_vs_dr_xmit; - break; - - case IP_VS_CONN_F_LOCALNODE: - cp->packet_xmit = ip_vs_null_xmit; - break; - - case IP_VS_CONN_F_BYPASS: - cp->packet_xmit = ip_vs_bypass_xmit; - break; - } -} - - -static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) -{ - return atomic_read(&dest->activeconns) - + atomic_read(&dest->inactconns); -} - -/* - * Bind a connection entry with a virtual service destination - * Called just after a new connection entry is created. - */ -static inline void -ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) -{ - /* if dest is NULL, then return directly */ - if (!dest) - return; - - /* Increase the refcnt counter of the dest */ - atomic_inc(&dest->refcnt); - - /* Bind with the destination and its corresponding transmitter */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) - /* if the connection is not template and is created - * by sync, preserve the activity flag. - */ - cp->flags |= atomic_read(&dest->conn_flags) & - (~IP_VS_CONN_F_INACTIVE); - else - cp->flags |= atomic_read(&dest->conn_flags); - cp->dest = dest; - - IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so increase the inactive - connection counter because it is in TCP SYNRECV - state (inactive) or other protocol inacive state */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_INACTIVE))) - atomic_inc(&dest->activeconns); - else - atomic_inc(&dest->inactconns); - } else { - /* It is a persistent connection/template, so increase - the peristent connection counter */ - atomic_inc(&dest->persistconns); - } - - if (dest->u_threshold != 0 && - ip_vs_dest_totalconns(dest) >= dest->u_threshold) - dest->flags |= IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Check if there is a destination for the connection, if so - * bind the connection to the destination. - */ -struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest; - - if ((cp) && (!cp->dest)) { - dest = ip_vs_find_dest(cp->daddr, cp->dport, - cp->vaddr, cp->vport, cp->protocol); - ip_vs_bind_dest(cp, dest); - return dest; - } else - return NULL; -} - - -/* - * Unbind a connection entry with its VS destination - * Called by the ip_vs_conn_expire function. - */ -static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) -{ - struct ip_vs_dest *dest = cp->dest; - - if (!dest) - return; - - IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " - "dest->refcnt:%d\n", - ip_vs_proto_name(cp->protocol), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), - ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); - - /* Update the connection counters */ - if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { - /* It is a normal connection, so decrease the inactconns - or activeconns counter */ - if (cp->flags & IP_VS_CONN_F_INACTIVE) { - atomic_dec(&dest->inactconns); - } else { - atomic_dec(&dest->activeconns); - } - } else { - /* It is a persistent connection/template, so decrease - the peristent connection counter */ - atomic_dec(&dest->persistconns); - } - - if (dest->l_threshold != 0) { - if (ip_vs_dest_totalconns(dest) < dest->l_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else if (dest->u_threshold != 0) { - if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } else { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - } - - /* - * Simply decrease the refcnt of the dest, because the - * dest will be either in service's destination list - * or in the trash. - */ - atomic_dec(&dest->refcnt); -} - - -/* - * Checking if the destination of a connection template is available. - * If available, return 1, otherwise invalidate this connection - * template and return 0. - */ -int ip_vs_check_template(struct ip_vs_conn *ct) -{ - struct ip_vs_dest *dest = ct->dest; - - /* - * Checking the dest server status. - */ - if ((dest == NULL) || - !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - (sysctl_ip_vs_expire_quiescent_template && - (atomic_read(&dest->weight) == 0))) { - IP_VS_DBG(9, "check_template: dest not available for " - "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "-> d:%u.%u.%u.%u:%d\n", - ip_vs_proto_name(ct->protocol), - NIPQUAD(ct->caddr), ntohs(ct->cport), - NIPQUAD(ct->vaddr), ntohs(ct->vport), - NIPQUAD(ct->daddr), ntohs(ct->dport)); - - /* - * Invalidate the connection template - */ - if (ct->vport != htons(0xffff)) { - if (ip_vs_conn_unhash(ct)) { - ct->dport = htons(0xffff); - ct->vport = htons(0xffff); - ct->cport = 0; - ip_vs_conn_hash(ct); - } - } - - /* - * Simply decrease the refcnt of the template, - * don't restart its timer. - */ - atomic_dec(&ct->refcnt); - return 0; - } - return 1; -} - -static void ip_vs_conn_expire(unsigned long data) -{ - struct ip_vs_conn *cp = (struct ip_vs_conn *)data; - - cp->timeout = 60*HZ; - - /* - * hey, I'm using it - */ - atomic_inc(&cp->refcnt); - - /* - * do I control anybody? - */ - if (atomic_read(&cp->n_control)) - goto expire_later; - - /* - * unhash it if it is hashed in the conn table - */ - if (!ip_vs_conn_unhash(cp)) - goto expire_later; - - /* - * refcnt==1 implies I'm the only one referrer - */ - if (likely(atomic_read(&cp->refcnt) == 1)) { - /* delete the timer if it is activated by other users */ - if (timer_pending(&cp->timer)) - del_timer(&cp->timer); - - /* does anybody control me? */ - if (cp->control) - ip_vs_control_del(cp); - - if (unlikely(cp->app != NULL)) - ip_vs_unbind_app(cp); - ip_vs_unbind_dest(cp); - if (cp->flags & IP_VS_CONN_F_NO_CPORT) - atomic_dec(&ip_vs_conn_no_cport_cnt); - atomic_dec(&ip_vs_conn_count); - - kmem_cache_free(ip_vs_conn_cachep, cp); - return; - } - - /* hash it back to the table */ - ip_vs_conn_hash(cp); - - expire_later: - IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt)-1, - atomic_read(&cp->n_control)); - - ip_vs_conn_put(cp); -} - - -void ip_vs_conn_expire_now(struct ip_vs_conn *cp) -{ - if (del_timer(&cp->timer)) - mod_timer(&cp->timer, jiffies); -} - - -/* - * Create a new connection entry and hash it into the ip_vs_conn_tab - */ -struct ip_vs_conn * -ip_vs_conn_new(int proto, __be32 caddr, __be16 cport, __be32 vaddr, __be16 vport, - __be32 daddr, __be16 dport, unsigned flags, - struct ip_vs_dest *dest) -{ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); - if (cp == NULL) { - IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); - return NULL; - } - - INIT_LIST_HEAD(&cp->c_list); - setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - cp->protocol = proto; - cp->caddr = caddr; - cp->cport = cport; - cp->vaddr = vaddr; - cp->vport = vport; - cp->daddr = daddr; - cp->dport = dport; - cp->flags = flags; - spin_lock_init(&cp->lock); - - /* - * Set the entry is referenced by the current thread before hashing - * it in the table, so that other thread run ip_vs_random_dropentry - * but cannot drop this entry. - */ - atomic_set(&cp->refcnt, 1); - - atomic_set(&cp->n_control, 0); - atomic_set(&cp->in_pkts, 0); - - atomic_inc(&ip_vs_conn_count); - if (flags & IP_VS_CONN_F_NO_CPORT) - atomic_inc(&ip_vs_conn_no_cport_cnt); - - /* Bind the connection with a destination server */ - ip_vs_bind_dest(cp, dest); - - /* Set its state and timeout */ - cp->state = 0; - cp->timeout = 3*HZ; - - /* Bind its packet transmitter */ - ip_vs_bind_xmit(cp); - - if (unlikely(pp && atomic_read(&pp->appcnt))) - ip_vs_bind_app(cp, pp); - - /* Hash it in the ip_vs_conn_tab finally */ - ip_vs_conn_hash(cp); - - return cp; -} - - -/* - * /proc/net/ip_vs_conn entries - */ -#ifdef CONFIG_PROC_FS - -static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) -{ - int idx; - struct ip_vs_conn *cp; - - for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - if (pos-- == 0) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - } - ct_read_unlock_bh(idx); - } - - return NULL; -} - -static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) -{ - seq->private = NULL; - return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; -} - -static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct ip_vs_conn *cp = v; - struct list_head *e, *l = seq->private; - int idx; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_conn_array(seq, 0); - - /* more on same hash chain? */ - if ((e = cp->c_list.next) != l) - return list_entry(e, struct ip_vs_conn, c_list); - - idx = l - ip_vs_conn_tab; - ct_read_unlock_bh(idx); - - while (++idx < IP_VS_CONN_TAB_SIZE) { - ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - seq->private = &ip_vs_conn_tab[idx]; - return cp; - } - ct_read_unlock_bh(idx); - } - seq->private = NULL; - return NULL; -} - -static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) -{ - struct list_head *l = seq->private; - - if (l) - ct_read_unlock_bh(l - ip_vs_conn_tab); -} - -static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); - else { - const struct ip_vs_conn *cp = v; - - seq_printf(seq, - "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr), ntohs(cp->cport), - ntohl(cp->vaddr), ntohs(cp->vport), - ntohl(cp->daddr), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_seq_show, -}; - -static int ip_vs_conn_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_seq_ops); -} - -static const struct file_operations ip_vs_conn_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -static const char *ip_vs_origin_name(unsigned flags) -{ - if (flags & IP_VS_CONN_F_SYNC) - return "SYNC"; - else - return "LOCAL"; -} - -static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v) -{ - - if (v == SEQ_START_TOKEN) - seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); - else { - const struct ip_vs_conn *cp = v; - - seq_printf(seq, - "%-3s %08X %04X %08X %04X %08X %04X %-11s %-6s %7lu\n", - ip_vs_proto_name(cp->protocol), - ntohl(cp->caddr), ntohs(cp->cport), - ntohl(cp->vaddr), ntohs(cp->vport), - ntohl(cp->daddr), ntohs(cp->dport), - ip_vs_state_name(cp->protocol, cp->state), - ip_vs_origin_name(cp->flags), - (cp->timer.expires-jiffies)/HZ); - } - return 0; -} - -static const struct seq_operations ip_vs_conn_sync_seq_ops = { - .start = ip_vs_conn_seq_start, - .next = ip_vs_conn_seq_next, - .stop = ip_vs_conn_seq_stop, - .show = ip_vs_conn_sync_seq_show, -}; - -static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &ip_vs_conn_sync_seq_ops); -} - -static const struct file_operations ip_vs_conn_sync_fops = { - .owner = THIS_MODULE, - .open = ip_vs_conn_sync_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#endif - - -/* - * Randomly drop connection entries before running out of memory - */ -static inline int todrop_entry(struct ip_vs_conn *cp) -{ - /* - * The drop rate array needs tuning for real environments. - * Called from timer bh only => no locking - */ - static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; - static char todrop_counter[9] = {0}; - int i; - - /* if the conn entry hasn't lasted for 60 seconds, don't drop it. - This will leave enough time for normal connection to get - through. */ - if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) - return 0; - - /* Don't drop the entry if its number of incoming packets is not - located in [0, 8] */ - i = atomic_read(&cp->in_pkts); - if (i > 8 || i < 0) return 0; - - if (!todrop_rate[i]) return 0; - if (--todrop_counter[i] > 0) return 0; - - todrop_counter[i] = todrop_rate[i]; - return 1; -} - -/* Called from keventd and must protect itself from softirqs */ -void ip_vs_random_dropentry(void) -{ - int idx; - struct ip_vs_conn *cp; - - /* - * Randomly scan 1/32 of the whole table every second - */ - for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { - unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; - - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(hash); - - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; - - if (cp->protocol == IPPROTO_TCP) { - switch(cp->state) { - case IP_VS_TCP_S_SYN_RECV: - case IP_VS_TCP_S_SYNACK: - break; - - case IP_VS_TCP_S_ESTABLISHED: - if (todrop_entry(cp)) - break; - continue; - - default: - continue; - } - } else { - if (!todrop_entry(cp)) - continue; - } - - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(hash); - } -} - - -/* - * Flush all the connection entries in the ip_vs_conn_tab - */ -static void ip_vs_conn_flush(void) -{ - int idx; - struct ip_vs_conn *cp; - - flush_again: - for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) { - /* - * Lock is actually needed in this loop. - */ - ct_write_lock_bh(idx); - - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { - - IP_VS_DBG(4, "del connection\n"); - ip_vs_conn_expire_now(cp); - if (cp->control) { - IP_VS_DBG(4, "del conn template\n"); - ip_vs_conn_expire_now(cp->control); - } - } - ct_write_unlock_bh(idx); - } - - /* the counter may be not NULL, because maybe some conn entries - are run by slow timer handler or unhashed but still referred */ - if (atomic_read(&ip_vs_conn_count) != 0) { - schedule(); - goto flush_again; - } -} - - -int __init ip_vs_conn_init(void) -{ - int idx; - - /* - * Allocate the connection hash table and initialize its list heads - */ - ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); - if (!ip_vs_conn_tab) - return -ENOMEM; - - /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!ip_vs_conn_cachep) { - vfree(ip_vs_conn_tab); - return -ENOMEM; - } - - IP_VS_INFO("Connection hash table configured " - "(size=%d, memory=%ldKbytes)\n", - IP_VS_CONN_TAB_SIZE, - (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); - IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", - sizeof(struct ip_vs_conn)); - - for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); - } - - for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { - rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); - } - - proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); - proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops); - - /* calculate the random value for connection hash */ - get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - - return 0; -} - - -void ip_vs_conn_cleanup(void) -{ - /* flush all the connection entries first */ - ip_vs_conn_flush(); - - /* Release the empty cache */ - kmem_cache_destroy(ip_vs_conn_cachep); - proc_net_remove(&init_net, "ip_vs_conn"); - proc_net_remove(&init_net, "ip_vs_conn_sync"); - vfree(ip_vs_conn_tab); -} diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c deleted file mode 100644 index a7879eafc3b5..000000000000 --- a/net/ipv4/ipvs/ip_vs_core.c +++ /dev/null @@ -1,1125 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, - * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms - * and others. - * - * Changes: - * Paul `Rusty' Russell properly handle non-linear skbs - * Harald Welte don't use nfcache - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/ip.h> -#include <linux/tcp.h> -#include <linux/icmp.h> - -#include <net/ip.h> -#include <net/tcp.h> -#include <net/udp.h> -#include <net/icmp.h> /* for icmp_send */ -#include <net/route.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -EXPORT_SYMBOL(register_ip_vs_scheduler); -EXPORT_SYMBOL(unregister_ip_vs_scheduler); -EXPORT_SYMBOL(ip_vs_skb_replace); -EXPORT_SYMBOL(ip_vs_proto_name); -EXPORT_SYMBOL(ip_vs_conn_new); -EXPORT_SYMBOL(ip_vs_conn_in_get); -EXPORT_SYMBOL(ip_vs_conn_out_get); -#ifdef CONFIG_IP_VS_PROTO_TCP -EXPORT_SYMBOL(ip_vs_tcp_conn_listen); -#endif -EXPORT_SYMBOL(ip_vs_conn_put); -#ifdef CONFIG_IP_VS_DEBUG -EXPORT_SYMBOL(ip_vs_get_debug_level); -#endif - - -/* ID used in ICMP lookups */ -#define icmp_id(icmph) (((icmph)->un).echo.id) - -const char *ip_vs_proto_name(unsigned proto) -{ - static char buf[20]; - - switch (proto) { - case IPPROTO_IP: - return "IP"; - case IPPROTO_UDP: - return "UDP"; - case IPPROTO_TCP: - return "TCP"; - case IPPROTO_ICMP: - return "ICMP"; - default: - sprintf(buf, "IP_%d", proto); - return buf; - } -} - -void ip_vs_init_hash_table(struct list_head *table, int rows) -{ - while (--rows >= 0) - INIT_LIST_HEAD(&table[rows]); -} - -static inline void -ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.inpkts++; - dest->stats.inbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.inpkts++; - dest->svc->stats.inbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.inpkts++; - ip_vs_stats.inbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) -{ - struct ip_vs_dest *dest = cp->dest; - if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - spin_lock(&dest->stats.lock); - dest->stats.outpkts++; - dest->stats.outbytes += skb->len; - spin_unlock(&dest->stats.lock); - - spin_lock(&dest->svc->stats.lock); - dest->svc->stats.outpkts++; - dest->svc->stats.outbytes += skb->len; - spin_unlock(&dest->svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.outpkts++; - ip_vs_stats.outbytes += skb->len; - spin_unlock(&ip_vs_stats.lock); - } -} - - -static inline void -ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) -{ - spin_lock(&cp->dest->stats.lock); - cp->dest->stats.conns++; - spin_unlock(&cp->dest->stats.lock); - - spin_lock(&svc->stats.lock); - svc->stats.conns++; - spin_unlock(&svc->stats.lock); - - spin_lock(&ip_vs_stats.lock); - ip_vs_stats.conns++; - spin_unlock(&ip_vs_stats.lock); -} - - -static inline int -ip_vs_set_state(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - if (unlikely(!pp->state_transition)) - return 0; - return pp->state_transition(cp, direction, skb, pp); -} - - -/* - * IPVS persistent scheduling function - * It creates a connection entry according to its template if exists, - * or selects a server and creates a connection entry plus a template. - * Locking: we are svc user (svc->refcnt), so we hold all dests too - * Protocols supported: TCP, UDP - */ -static struct ip_vs_conn * -ip_vs_sched_persist(struct ip_vs_service *svc, - const struct sk_buff *skb, - __be16 ports[2]) -{ - struct ip_vs_conn *cp = NULL; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest; - struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ - __be32 snet; /* source network of the client, after masking */ - - /* Mask saddr with the netmask to adjust template granularity */ - snet = iph->saddr & svc->netmask; - - IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " - "mnet %u.%u.%u.%u\n", - NIPQUAD(iph->saddr), ntohs(ports[0]), - NIPQUAD(iph->daddr), ntohs(ports[1]), - NIPQUAD(snet)); - - /* - * As far as we know, FTP is a very complicated network protocol, and - * it uses control connection and data connections. For active FTP, - * FTP server initialize data connection to the client, its source port - * is often 20. For passive FTP, FTP server tells the clients the port - * that it passively listens to, and the client issues the data - * connection. In the tunneling or direct routing mode, the load - * balancer is on the client-to-server half of connection, the port - * number is unknown to the load balancer. So, a conn template like - * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP - * service, and a template like <caddr, 0, vaddr, vport, daddr, dport> - * is created for other persistent services. - */ - if (ports[1] == svc->port) { - /* Check if a template already exists */ - if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, ports[1]); - else - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * No template found or the dest of the connection - * template is not available. - */ - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template like <protocol,caddr,0, - * vaddr,vport,daddr,dport> for non-ftp service, - * and <protocol,caddr,0,vaddr,0,daddr,0> - * for ftp service. - */ - if (svc->port != FTPPORT) - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, - ports[1], - dest->addr, dest->port, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, 0, - dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = dest->port; - } else { - /* - * Note: persistent fwmark-based services and persistent - * port zero service are handled here. - * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0> - * port zero template: <protocol,caddr,0,vaddr,0,daddr,0> - */ - if (svc->fwmark) - ct = ip_vs_ct_in_get(IPPROTO_IP, snet, 0, - htonl(svc->fwmark), 0); - else - ct = ip_vs_ct_in_get(iph->protocol, snet, 0, - iph->daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * If it is not persistent port zero, return NULL, - * otherwise create a connection template. - */ - if (svc->port) - return NULL; - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template according to the service - */ - if (svc->fwmark) - ct = ip_vs_conn_new(IPPROTO_IP, - snet, 0, - htonl(svc->fwmark), 0, - dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(iph->protocol, - snet, 0, - iph->daddr, 0, - dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = ports[1]; - } - - /* - * Create a new connection according to the template - */ - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, ports[0], - iph->daddr, ports[1], - dest->addr, dport, - 0, - dest); - if (cp == NULL) { - ip_vs_conn_put(ct); - return NULL; - } - - /* - * Add its control - */ - ip_vs_control_add(cp, ct); - ip_vs_conn_put(ct); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * IPVS main scheduling function - * It selects a server according to the virtual service, and - * creates a connection entry. - * Protocols supported: TCP, UDP - */ -struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_conn *cp = NULL; - struct iphdr *iph = ip_hdr(skb); - struct ip_vs_dest *dest; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - /* - * Persistent service - */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - return ip_vs_sched_persist(svc, skb, pptr); - - /* - * Non-persistent service - */ - if (!svc->fwmark && pptr[1] != svc->port) { - if (!svc->port) - IP_VS_ERR("Schedule: port zero only supported " - "in persistent services, " - "check your ipvs configuration\n"); - return NULL; - } - - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "Schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a connection entry. - */ - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1], - dest->addr, dest->port?dest->port:pptr[1], - 0, - dest); - if (cp == NULL) - return NULL; - - IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " - "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", - ip_vs_fwd_tag(cp), - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - NIPQUAD(cp->daddr), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); - - ip_vs_conn_stats(cp, svc); - return cp; -} - - -/* - * Pass or drop the packet. - * Called by ip_vs_in, when the virtual service is available but - * no destination is available for a new connection. - */ -int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - __be16 _ports[2], *pptr; - struct iphdr *iph = ip_hdr(skb); - - pptr = skb_header_pointer(skb, iph->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) { - ip_vs_service_put(svc); - return NF_DROP; - } - - /* if it is fwmark-based service, the cache_bypass sysctl is up - and the destination is RTN_UNICAST (and not local), then create - a cache_bypass connection entry */ - if (sysctl_ip_vs_cache_bypass && svc->fwmark - && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) { - int ret, cs; - struct ip_vs_conn *cp; - - ip_vs_service_put(svc); - - /* create a new connection entry */ - IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); - cp = ip_vs_conn_new(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1], - 0, 0, - IP_VS_CONN_F_BYPASS, - NULL); - if (cp == NULL) - return NF_DROP; - - /* statistics */ - ip_vs_in_stats(cp, skb); - - /* set state */ - cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - - /* transmit the first SYN packet */ - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - - atomic_inc(&cp->in_pkts); - ip_vs_conn_put(cp); - return ret; - } - - /* - * When the virtual ftp service is presented, packets destined - * for other services on the VIP may get here (except services - * listed in the ipvs table), pass the packets, because it is - * not ipvs job to decide to drop the packets. - */ - if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { - ip_vs_service_put(svc); - return NF_ACCEPT; - } - - ip_vs_service_put(svc); - - /* - * Notify the client that the destination is unreachable, and - * release the socket buffer. - * Since it is in IP layer, the TCP socket is not actually - * created, the TCP RST packet cannot be sent, instead that - * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ - */ - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); - return NF_DROP; -} - - -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain, and is used for VS/NAT. - * It detects packets for VS/NAT connections and sends the packets - * immediately. This can avoid that iptable_nat mangles the packets - * for VS/NAT. - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - -__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) -{ - return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); -} - -static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) -{ - int err = ip_defrag(skb, user); - - if (!err) - ip_send_check(ip_hdr(skb)); - - return err; -} - -/* - * Packet has been made sufficiently writable in caller - * - inout: 1=in->out, 0=out->in - */ -void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, - struct ip_vs_conn *cp, int inout) -{ - struct iphdr *iph = ip_hdr(skb); - unsigned int icmp_offset = iph->ihl*4; - struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) + - icmp_offset); - struct iphdr *ciph = (struct iphdr *)(icmph + 1); - - if (inout) { - iph->saddr = cp->vaddr; - ip_send_check(iph); - ciph->daddr = cp->vaddr; - ip_send_check(ciph); - } else { - iph->daddr = cp->daddr; - ip_send_check(iph); - ciph->saddr = cp->daddr; - ip_send_check(ciph); - } - - /* the TCP/UDP port */ - if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { - __be16 *ports = (void *)ciph + ciph->ihl*4; - - if (inout) - ports[1] = cp->vport; - else - ports[0] = cp->dport; - } - - /* And finally the ICMP checksum */ - icmph->checksum = 0; - icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; - - if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMP"); - else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMP"); -} - -/* - * Handle ICMP messages in the inside-to-outside direction (outgoing). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. - * Currently handles error types - unreachable, quench, ttl exceeded. - * (Only used in VS/NAT) - */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); - - offset += cih->ihl * 4; - - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_out_get(skb, pp, cih, offset, 1); - if (!cp) - return NF_ACCEPT; - - verdict = NF_DROP; - - if (IP_VS_FWD_METHOD(cp) != 0) { - IP_VS_ERR("shouldn't reach here, because the box is on the " - "half connection in the tun/dr module.\n"); - } - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - if (!skb_make_writable(skb, offset)) - goto out; - - ip_vs_nat_icmp(skb, pp, cp, 1); - - /* do the statistics and put it back */ - ip_vs_out_stats(cp, skb); - - skb->ipvs_property = 1; - verdict = NF_ACCEPT; - - out: - __ip_vs_conn_put(cp); - - return verdict; -} - -static inline int is_tcp_reset(const struct sk_buff *skb) -{ - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - return th->rst; -} - -/* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. - * Check if outgoing packet belongs to the established ip_vs_conn, - * rewrite addresses of the packet and send it on its way... - */ -static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct iphdr *iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int ihl; - - EnterFunction(11); - - if (skb->ipvs_property) - return NF_ACCEPT; - - iph = ip_hdr(skb); - if (unlikely(iph->protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); - - if (related) - return verdict; - iph = ip_hdr(skb); - } - - pp = ip_vs_proto_get(iph->protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - /* reassemble IP fragments */ - if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) && - !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) - return NF_STOLEN; - iph = ip_hdr(skb); - } - - ihl = iph->ihl << 2; - - /* - * Check if the packet belongs to an existing entry - */ - cp = pp->conn_out_get(skb, pp, iph, ihl, 0); - - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, ihl, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(iph->protocol, - iph->saddr, pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if (iph->protocol != IPPROTO_TCP - || !is_tcp_reset(skb)) { - icmp_send(skb,ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } - } - } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); - - if (!skb_make_writable(skb, ihl)) - goto drop; - - /* mangle the packet */ - if (pp->snat_handler && !pp->snat_handler(skb, pp, cp)) - goto drop; - ip_hdr(skb)->saddr = cp->vaddr; - ip_send_check(ip_hdr(skb)); - - /* For policy routing, packets originating from this - * machine itself may be routed differently to packets - * passing through. We want this packet to be routed as - * if it came from this machine itself. So re-compute - * the routing information. - */ - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); - - ip_vs_out_stats(cp, skb); - ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_conn_put(cp); - - skb->ipvs_property = 1; - - LeaveFunction(11); - return NF_ACCEPT; - - drop: - ip_vs_conn_put(cp); - kfree_skb(skb); - return NF_STOLEN; -} - - -/* - * Handle ICMP messages in the outside-to-inside direction (incoming). - * Find any that might be relevant, check against existing connections, - * forward to the right destination host if relevant. - * Currently handles error types - unreachable, quench, ttl exceeded. - */ -static int -ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) -{ - struct iphdr *iph; - struct icmphdr _icmph, *ic; - struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - unsigned int offset, ihl, verdict; - - *related = 1; - - /* reassemble IP fragments */ - if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) - return NF_STOLEN; - } - - iph = ip_hdr(skb); - offset = ihl = iph->ihl * 4; - ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); - if (ic == NULL) - return NF_DROP; - - IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", - ic->type, ntohs(icmp_id(ic)), - NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); - - /* - * Work through seeing if this is for us. - * These checks are supposed to be in an order that means easy - * things are checked first to speed up processing.... however - * this means that some packets will manage to get a long way - * down this stack and then be rejected, but that's life. - */ - if ((ic->type != ICMP_DEST_UNREACH) && - (ic->type != ICMP_SOURCE_QUENCH) && - (ic->type != ICMP_TIME_EXCEEDED)) { - *related = 0; - return NF_ACCEPT; - } - - /* Now find the contained IP header */ - offset += sizeof(_icmph); - cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); - if (cih == NULL) - return NF_ACCEPT; /* The packet looks wrong, ignore */ - - pp = ip_vs_proto_get(cih->protocol); - if (!pp) - return NF_ACCEPT; - - /* Is the embedded protocol header present? */ - if (unlikely(cih->frag_off & htons(IP_OFFSET) && - pp->dont_defrag)) - return NF_ACCEPT; - - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); - - offset += cih->ihl * 4; - - /* The embedded headers contain source and dest in reverse order */ - cp = pp->conn_in_get(skb, pp, cih, offset, 1); - if (!cp) - return NF_ACCEPT; - - verdict = NF_DROP; - - /* Ensure the checksum is correct */ - if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) { - /* Failed checksum! */ - IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", - NIPQUAD(iph->saddr)); - goto out; - } - - /* do the statistics and put it back */ - ip_vs_in_stats(cp, skb); - if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) - offset += 2 * sizeof(__u16); - verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ - - out: - __ip_vs_conn_put(cp); - - return verdict; -} - -/* - * Check if it's for virtual services, look it up, - * and send it on its way... - */ -static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct iphdr *iph; - struct ip_vs_protocol *pp; - struct ip_vs_conn *cp; - int ret, restart; - int ihl; - - /* - * Big tappo: only PACKET_HOST (neither loopback nor mcasts) - * ... don't know why 1st test DOES NOT include 2nd (?) - */ - if (unlikely(skb->pkt_type != PACKET_HOST - || skb->dev->flags & IFF_LOOPBACK || skb->sk)) { - IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", - skb->pkt_type, - ip_hdr(skb)->protocol, - NIPQUAD(ip_hdr(skb)->daddr)); - return NF_ACCEPT; - } - - iph = ip_hdr(skb); - if (unlikely(iph->protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); - - if (related) - return verdict; - iph = ip_hdr(skb); - } - - /* Protocol supported? */ - pp = ip_vs_proto_get(iph->protocol); - if (unlikely(!pp)) - return NF_ACCEPT; - - ihl = iph->ihl << 2; - - /* - * Check if the packet belongs to an existing connection entry - */ - cp = pp->conn_in_get(skb, pp, iph, ihl, 0); - - if (unlikely(!cp)) { - int v; - - if (!pp->conn_schedule(skb, pp, &v, &cp)) - return v; - } - - if (unlikely(!cp)) { - /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; - } - - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); - - /* Check the server status */ - if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { - /* the destination server is not available */ - - if (sysctl_ip_vs_expire_nodest_conn) { - /* try to expire the connection immediately */ - ip_vs_conn_expire_now(cp); - } - /* don't restart its timer, and silently - drop the packet. */ - __ip_vs_conn_put(cp); - return NF_DROP; - } - - ip_vs_in_stats(cp, skb); - restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); - if (cp->packet_xmit) - ret = cp->packet_xmit(skb, cp, pp); - /* do not touch skb anymore */ - else { - IP_VS_DBG_RL("warning: packet_xmit is null"); - ret = NF_ACCEPT; - } - - /* Increase its packet counter and check if it is needed - * to be synchronized - * - * Sync connection if it is about to close to - * encorage the standby servers to update the connections timeout - */ - atomic_inc(&cp->in_pkts); - if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && - (((cp->protocol != IPPROTO_TCP || - cp->state == IP_VS_TCP_S_ESTABLISHED) && - (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] - == sysctl_ip_vs_sync_threshold[0])) || - ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && - ((cp->state == IP_VS_TCP_S_FIN_WAIT) || - (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || - (cp->state == IP_VS_TCP_S_TIME_WAIT))))) - ip_vs_sync_conn(cp); - cp->old_state = cp->state; - - ip_vs_conn_put(cp); - return ret; -} - - -/* - * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP - * related packets destined for 0.0.0.0/0. - * When fwmark-based virtual service is used, such as transparent - * cache cluster, TCP packets can be marked and routed to ip_vs_in, - * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and - * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain - * and send them to ip_vs_in_icmp. - */ -static unsigned int -ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - int r; - - if (ip_hdr(skb)->protocol != IPPROTO_ICMP) - return NF_ACCEPT; - - return ip_vs_in_icmp(skb, &r, hooknum); -} - - -static struct nf_hook_ops ip_vs_ops[] __read_mostly = { - /* After packet filtering, forward packet through VS/DR, VS/TUN, - * or VS/NAT(change destination), so that filtering rules can be - * applied to IPVS. */ - { - .hook = ip_vs_in, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, - }, - /* After packet filtering, change source only for VS/NAT */ - { - .hook = ip_vs_out, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, - }, - /* After packet filtering (but before ip_vs_out_icmp), catch icmp - * destined for 0.0.0.0/0, which is for incoming IPVS connections */ - { - .hook = ip_vs_forward_icmp, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 99, - }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, -}; - - -/* - * Initialize IP Virtual Server - */ -static int __init ip_vs_init(void) -{ - int ret; - - ret = ip_vs_control_init(); - if (ret < 0) { - IP_VS_ERR("can't setup control.\n"); - goto cleanup_nothing; - } - - ip_vs_protocol_init(); - - ret = ip_vs_app_init(); - if (ret < 0) { - IP_VS_ERR("can't setup application helper.\n"); - goto cleanup_protocol; - } - - ret = ip_vs_conn_init(); - if (ret < 0) { - IP_VS_ERR("can't setup connection table.\n"); - goto cleanup_app; - } - - ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) { - IP_VS_ERR("can't register hooks.\n"); - goto cleanup_conn; - } - - IP_VS_INFO("ipvs loaded.\n"); - return ret; - - cleanup_conn: - ip_vs_conn_cleanup(); - cleanup_app: - ip_vs_app_cleanup(); - cleanup_protocol: - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - cleanup_nothing: - return ret; -} - -static void __exit ip_vs_cleanup(void) -{ - nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - ip_vs_conn_cleanup(); - ip_vs_app_cleanup(); - ip_vs_protocol_cleanup(); - ip_vs_control_cleanup(); - IP_VS_INFO("ipvs unloaded.\n"); -} - -module_init(ip_vs_init); -module_exit(ip_vs_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c deleted file mode 100644 index 6379705a8dcb..000000000000 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ /dev/null @@ -1,2373 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/types.h> -#include <linux/capability.h> -#include <linux/fs.h> -#include <linux/sysctl.h> -#include <linux/proc_fs.h> -#include <linux/workqueue.h> -#include <linux/swap.h> -#include <linux/seq_file.h> - -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/mutex.h> - -#include <net/net_namespace.h> -#include <net/ip.h> -#include <net/route.h> -#include <net/sock.h> - -#include <asm/uaccess.h> - -#include <net/ip_vs.h> - -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ -static DEFINE_MUTEX(__ip_vs_mutex); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_svc_lock); - -/* lock for table with the real services */ -static DEFINE_RWLOCK(__ip_vs_rs_lock); - -/* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); - -/* lock for drop entry handling */ -static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); - -/* lock for drop packet handling */ -static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); - -/* 1/rate drop and drop-entry variables */ -int ip_vs_drop_rate = 0; -int ip_vs_drop_counter = 0; -static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); - -/* number of virtual services */ -static int ip_vs_num_services = 0; - -/* sysctl variables */ -static int sysctl_ip_vs_drop_entry = 0; -static int sysctl_ip_vs_drop_packet = 0; -static int sysctl_ip_vs_secure_tcp = 0; -static int sysctl_ip_vs_amemthresh = 1024; -static int sysctl_ip_vs_am_droprate = 10; -int sysctl_ip_vs_cache_bypass = 0; -int sysctl_ip_vs_expire_nodest_conn = 0; -int sysctl_ip_vs_expire_quiescent_template = 0; -int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; -int sysctl_ip_vs_nat_icmp_send = 0; - - -#ifdef CONFIG_IP_VS_DEBUG -static int sysctl_ip_vs_debug_level = 0; - -int ip_vs_get_debug_level(void) -{ - return sysctl_ip_vs_debug_level; -} -#endif - -/* - * update_defense_level is called from keventd and from sysctl, - * so it needs to protect itself from softirqs - */ -static void update_defense_level(void) -{ - struct sysinfo i; - static int old_secure_tcp = 0; - int availmem; - int nomem; - int to_change = -1; - - /* we only count free and buffered memory (in pages) */ - si_meminfo(&i); - availmem = i.freeram + i.bufferram; - /* however in linux 2.5 the i.bufferram is total page cache size, - we need adjust it */ - /* si_swapinfo(&i); */ - /* availmem = availmem - (i.totalswap - i.freeswap); */ - - nomem = (availmem < sysctl_ip_vs_amemthresh); - - local_bh_disable(); - - /* drop_entry */ - spin_lock(&__ip_vs_dropentry_lock); - switch (sysctl_ip_vs_drop_entry) { - case 0: - atomic_set(&ip_vs_dropentry, 0); - break; - case 1: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - sysctl_ip_vs_drop_entry = 2; - } else { - atomic_set(&ip_vs_dropentry, 0); - } - break; - case 2: - if (nomem) { - atomic_set(&ip_vs_dropentry, 1); - } else { - atomic_set(&ip_vs_dropentry, 0); - sysctl_ip_vs_drop_entry = 1; - }; - break; - case 3: - atomic_set(&ip_vs_dropentry, 1); - break; - } - spin_unlock(&__ip_vs_dropentry_lock); - - /* drop_packet */ - spin_lock(&__ip_vs_droppacket_lock); - switch (sysctl_ip_vs_drop_packet) { - case 0: - ip_vs_drop_rate = 0; - break; - case 1: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - sysctl_ip_vs_drop_packet = 2; - } else { - ip_vs_drop_rate = 0; - } - break; - case 2: - if (nomem) { - ip_vs_drop_rate = ip_vs_drop_counter - = sysctl_ip_vs_amemthresh / - (sysctl_ip_vs_amemthresh-availmem); - } else { - ip_vs_drop_rate = 0; - sysctl_ip_vs_drop_packet = 1; - } - break; - case 3: - ip_vs_drop_rate = sysctl_ip_vs_am_droprate; - break; - } - spin_unlock(&__ip_vs_droppacket_lock); - - /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); - switch (sysctl_ip_vs_secure_tcp) { - case 0: - if (old_secure_tcp >= 2) - to_change = 0; - break; - case 1: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - sysctl_ip_vs_secure_tcp = 2; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - } - break; - case 2: - if (nomem) { - if (old_secure_tcp < 2) - to_change = 1; - } else { - if (old_secure_tcp >= 2) - to_change = 0; - sysctl_ip_vs_secure_tcp = 1; - } - break; - case 3: - if (old_secure_tcp < 2) - to_change = 1; - break; - } - old_secure_tcp = sysctl_ip_vs_secure_tcp; - if (to_change >= 0) - ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); - - local_bh_enable(); -} - - -/* - * Timer for checking the defense - */ -#define DEFENSE_TIMER_PERIOD 1*HZ -static void defense_work_handler(struct work_struct *work); -static DECLARE_DELAYED_WORK(defense_work, defense_work_handler); - -static void defense_work_handler(struct work_struct *work) -{ - update_defense_level(); - if (atomic_read(&ip_vs_dropentry)) - ip_vs_random_dropentry(); - - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); -} - -int -ip_vs_use_count_inc(void) -{ - return try_module_get(THIS_MODULE); -} - -void -ip_vs_use_count_dec(void) -{ - module_put(THIS_MODULE); -} - - -/* - * Hash table: for virtual service lookups - */ -#define IP_VS_SVC_TAB_BITS 8 -#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) -#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) - -/* the service table hashed by <protocol, addr, port> */ -static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; -/* the service table hashed by fwmark */ -static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; - -/* - * Hash table: for real service lookups - */ -#define IP_VS_RTAB_BITS 4 -#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) -#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1) - -static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; - -/* - * Trash for destinations - */ -static LIST_HEAD(ip_vs_dest_trash); - -/* - * FTP & NULL virtual service counters - */ -static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); -static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); - - -/* - * Returns hash value for virtual service - */ -static __inline__ unsigned -ip_vs_svc_hashkey(unsigned proto, __be32 addr, __be16 port) -{ - register unsigned porth = ntohs(port); - - return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) - & IP_VS_SVC_TAB_MASK; -} - -/* - * Returns hash value of fwmark for virtual service lookup - */ -static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) -{ - return fwmark & IP_VS_SVC_TAB_MASK; -} - -/* - * Hashes a service in the ip_vs_svc_table by <proto,addr,port> - * or in the ip_vs_svc_fwm_table by fwmark. - * Should be called with locked tables. - */ -static int ip_vs_svc_hash(struct ip_vs_service *svc) -{ - unsigned hash; - - if (svc->flags & IP_VS_SVC_F_HASHED) { - IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* - * Hash it by <protocol,addr,port> in ip_vs_svc_table - */ - hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); - list_add(&svc->s_list, &ip_vs_svc_table[hash]); - } else { - /* - * Hash it by fwmark in ip_vs_svc_fwm_table - */ - hash = ip_vs_svc_fwm_hashkey(svc->fwmark); - list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); - } - - svc->flags |= IP_VS_SVC_F_HASHED; - /* increase its refcnt because it is referenced by the svc table */ - atomic_inc(&svc->refcnt); - return 1; -} - - -/* - * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. - * Should be called with locked tables. - */ -static int ip_vs_svc_unhash(struct ip_vs_service *svc) -{ - if (!(svc->flags & IP_VS_SVC_F_HASHED)) { - IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - if (svc->fwmark == 0) { - /* Remove it from the ip_vs_svc_table table */ - list_del(&svc->s_list); - } else { - /* Remove it from the ip_vs_svc_fwm_table table */ - list_del(&svc->f_list); - } - - svc->flags &= ~IP_VS_SVC_F_HASHED; - atomic_dec(&svc->refcnt); - return 1; -} - - -/* - * Get service by {proto,addr,port} in the service table. - */ -static __inline__ struct ip_vs_service * -__ip_vs_service_get(__u16 protocol, __be32 vaddr, __be16 vport) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for "full" addressed entries */ - hash = ip_vs_svc_hashkey(protocol, vaddr, vport); - - list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ - if ((svc->addr == vaddr) - && (svc->port == vport) - && (svc->protocol == protocol)) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - - -/* - * Get service by {fwmark} in the service table. - */ -static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) -{ - unsigned hash; - struct ip_vs_service *svc; - - /* Check for fwmark addressed entries */ - hash = ip_vs_svc_fwm_hashkey(fwmark); - - list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { - if (svc->fwmark == fwmark) { - /* HIT */ - atomic_inc(&svc->usecnt); - return svc; - } - } - - return NULL; -} - -struct ip_vs_service * -ip_vs_service_get(__u32 fwmark, __u16 protocol, __be32 vaddr, __be16 vport) -{ - struct ip_vs_service *svc; - - read_lock(&__ip_vs_svc_lock); - - /* - * Check the table hashed by fwmark first - */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) - goto out; - - /* - * Check the table hashed by <protocol,addr,port> - * for "full" addressed entries - */ - svc = __ip_vs_service_get(protocol, vaddr, vport); - - if (svc == NULL - && protocol == IPPROTO_TCP - && atomic_read(&ip_vs_ftpsvc_counter) - && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { - /* - * Check if ftp service entry exists, the packet - * might belong to FTP data connections. - */ - svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); - } - - if (svc == NULL - && atomic_read(&ip_vs_nullsvc_counter)) { - /* - * Check if the catch-all port (port zero) exists - */ - svc = __ip_vs_service_get(protocol, vaddr, 0); - } - - out: - read_unlock(&__ip_vs_svc_lock); - - IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", - fwmark, ip_vs_proto_name(protocol), - NIPQUAD(vaddr), ntohs(vport), - svc?"hit":"not hit"); - - return svc; -} - - -static inline void -__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - atomic_inc(&svc->refcnt); - dest->svc = svc; -} - -static inline void -__ip_vs_unbind_svc(struct ip_vs_dest *dest) -{ - struct ip_vs_service *svc = dest->svc; - - dest->svc = NULL; - if (atomic_dec_and_test(&svc->refcnt)) - kfree(svc); -} - - -/* - * Returns hash value for real service - */ -static __inline__ unsigned ip_vs_rs_hashkey(__be32 addr, __be16 port) -{ - register unsigned porth = ntohs(port); - - return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) - & IP_VS_RTAB_MASK; -} - -/* - * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. - * should be called with locked tables. - */ -static int ip_vs_rs_hash(struct ip_vs_dest *dest) -{ - unsigned hash; - - if (!list_empty(&dest->d_list)) { - return 0; - } - - /* - * Hash by proto,addr,port, - * which are the parameters of the real service. - */ - hash = ip_vs_rs_hashkey(dest->addr, dest->port); - list_add(&dest->d_list, &ip_vs_rtable[hash]); - - return 1; -} - -/* - * UNhashes ip_vs_dest from ip_vs_rtable. - * should be called with locked tables. - */ -static int ip_vs_rs_unhash(struct ip_vs_dest *dest) -{ - /* - * Remove it from the ip_vs_rtable table. - */ - if (!list_empty(&dest->d_list)) { - list_del(&dest->d_list); - INIT_LIST_HEAD(&dest->d_list); - } - - return 1; -} - -/* - * Lookup real service by <proto,addr,port> in the real service table. - */ -struct ip_vs_dest * -ip_vs_lookup_real_service(__u16 protocol, __be32 daddr, __be16 dport) -{ - unsigned hash; - struct ip_vs_dest *dest; - - /* - * Check for "full" addressed entries - * Return the first found entry - */ - hash = ip_vs_rs_hashkey(daddr, dport); - - read_lock(&__ip_vs_rs_lock); - list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { - if ((dest->addr == daddr) - && (dest->port == dport) - && ((dest->protocol == protocol) || - dest->vfwmark)) { - /* HIT */ - read_unlock(&__ip_vs_rs_lock); - return dest; - } - } - read_unlock(&__ip_vs_rs_lock); - - return NULL; -} - -/* - * Lookup destination by {addr,port} in the given service - */ -static struct ip_vs_dest * -ip_vs_lookup_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) -{ - struct ip_vs_dest *dest; - - /* - * Find the destination for the given service - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->addr == daddr) && (dest->port == dport)) { - /* HIT */ - return dest; - } - } - - return NULL; -} - -/* - * Find destination by {daddr,dport,vaddr,protocol} - * Cretaed to be used in ip_vs_process_message() in - * the backup synchronization daemon. It finds the - * destination to be bound to the received connection - * on the backup. - * - * ip_vs_lookup_real_service() looked promissing, but - * seems not working as expected. - */ -struct ip_vs_dest *ip_vs_find_dest(__be32 daddr, __be16 dport, - __be32 vaddr, __be16 vport, __u16 protocol) -{ - struct ip_vs_dest *dest; - struct ip_vs_service *svc; - - svc = ip_vs_service_get(0, protocol, vaddr, vport); - if (!svc) - return NULL; - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest) - atomic_inc(&dest->refcnt); - ip_vs_service_put(svc); - return dest; -} - -/* - * Lookup dest by {svc,addr,port} in the destination trash. - * The destination trash is used to hold the destinations that are removed - * from the service table but are still referenced by some conn entries. - * The reason to add the destination trash is when the dest is temporary - * down (either by administrator or by monitor program), the dest can be - * picked back from the trash, the remaining connections to the dest can - * continue, and the counting information of the dest is also useful for - * scheduling. - */ -static struct ip_vs_dest * -ip_vs_trash_get_dest(struct ip_vs_service *svc, __be32 daddr, __be16 dport) -{ - struct ip_vs_dest *dest, *nxt; - - /* - * Find the destination in trash - */ - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " - "dest->refcnt=%d\n", - dest->vfwmark, - NIPQUAD(dest->addr), ntohs(dest->port), - atomic_read(&dest->refcnt)); - if (dest->addr == daddr && - dest->port == dport && - dest->vfwmark == svc->fwmark && - dest->protocol == svc->protocol && - (svc->fwmark || - (dest->vaddr == svc->addr && - dest->vport == svc->port))) { - /* HIT */ - return dest; - } - - /* - * Try to purge the destination from trash if not referenced - */ - if (atomic_read(&dest->refcnt) == 1) { - IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " - "from trash\n", - dest->vfwmark, - NIPQUAD(dest->addr), ntohs(dest->port)); - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } - } - - return NULL; -} - - -/* - * Clean up all the destinations in the trash - * Called by the ip_vs_control_cleanup() - * - * When the ip_vs_control_clearup is activated by ipvs module exit, - * the service tables must have been flushed and all the connections - * are expired, and the refcnt of each destination in the trash must - * be 1, so we simply release them here. - */ -static void ip_vs_trash_cleanup(void) -{ - struct ip_vs_dest *dest, *nxt; - - list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { - list_del(&dest->n_list); - ip_vs_dst_reset(dest); - __ip_vs_unbind_svc(dest); - kfree(dest); - } -} - - -static void -ip_vs_zero_stats(struct ip_vs_stats *stats) -{ - spin_lock_bh(&stats->lock); - - stats->conns = 0; - stats->inpkts = 0; - stats->outpkts = 0; - stats->inbytes = 0; - stats->outbytes = 0; - - stats->cps = 0; - stats->inpps = 0; - stats->outpps = 0; - stats->inbps = 0; - stats->outbps = 0; - - ip_vs_zero_estimator(stats); - - spin_unlock_bh(&stats->lock); -} - -/* - * Update a destination in the given service - */ -static void -__ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) -{ - int conn_flags; - - /* set the weight and the flags */ - atomic_set(&dest->weight, udest->weight); - conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; - - /* check if local node and update the flags */ - if (inet_addr_type(&init_net, udest->addr) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - - /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ - if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { - conn_flags |= IP_VS_CONN_F_NOOUTPUT; - } else { - /* - * Put the real service in ip_vs_rtable if not present. - * For now only for NAT! - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_hash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - } - atomic_set(&dest->conn_flags, conn_flags); - - /* bind the service */ - if (!dest->svc) { - __ip_vs_bind_svc(dest, svc); - } else { - if (dest->svc != svc) { - __ip_vs_unbind_svc(dest); - ip_vs_zero_stats(&dest->stats); - __ip_vs_bind_svc(dest, svc); - } - } - - /* set the dest status flags */ - dest->flags |= IP_VS_DEST_F_AVAILABLE; - - if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) - dest->flags &= ~IP_VS_DEST_F_OVERLOAD; - dest->u_threshold = udest->u_threshold; - dest->l_threshold = udest->l_threshold; -} - - -/* - * Create a destination for the given service - */ -static int -ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, - struct ip_vs_dest **dest_p) -{ - struct ip_vs_dest *dest; - unsigned atype; - - EnterFunction(2); - - atype = inet_addr_type(&init_net, udest->addr); - if (atype != RTN_LOCAL && atype != RTN_UNICAST) - return -EINVAL; - - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); - if (dest == NULL) { - IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); - return -ENOMEM; - } - - dest->protocol = svc->protocol; - dest->vaddr = svc->addr; - dest->vport = svc->port; - dest->vfwmark = svc->fwmark; - dest->addr = udest->addr; - dest->port = udest->port; - - atomic_set(&dest->activeconns, 0); - atomic_set(&dest->inactconns, 0); - atomic_set(&dest->persistconns, 0); - atomic_set(&dest->refcnt, 0); - - INIT_LIST_HEAD(&dest->d_list); - spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); - __ip_vs_update_dest(svc, dest, udest); - ip_vs_new_estimator(&dest->stats); - - *dest_p = dest; - - LeaveFunction(2); - return 0; -} - - -/* - * Add a destination into an existing service - */ -static int -ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) -{ - struct ip_vs_dest *dest; - __be32 daddr = udest->addr; - __be16 dport = udest->port; - int ret; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - /* - * Check if the dest already exists in the list - */ - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest != NULL) { - IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); - return -EEXIST; - } - - /* - * Check if the dest already exists in the trash and - * is from the same service - */ - dest = ip_vs_trash_get_dest(svc, daddr, dport); - if (dest != NULL) { - IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " - "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", - NIPQUAD(daddr), ntohs(dport), - atomic_read(&dest->refcnt), - dest->vfwmark, - NIPQUAD(dest->vaddr), - ntohs(dest->vport)); - __ip_vs_update_dest(svc, dest, udest); - - /* - * Get the destination from the trash - */ - list_del(&dest->n_list); - - ip_vs_new_estimator(&dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - return 0; - } - - /* - * Allocate and initialize the dest structure - */ - ret = ip_vs_new_dest(svc, udest, &dest); - if (ret) { - return ret; - } - - /* - * Add the dest entry into the list - */ - atomic_inc(&dest->refcnt); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Edit a destination in the given service - */ -static int -ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) -{ - struct ip_vs_dest *dest; - __be32 daddr = udest->addr; - __be16 dport = udest->port; - - EnterFunction(2); - - if (udest->weight < 0) { - IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); - return -ERANGE; - } - - if (udest->l_threshold > udest->u_threshold) { - IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " - "upper threshold\n"); - return -ERANGE; - } - - /* - * Lookup the destination list - */ - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); - return -ENOENT; - } - - __ip_vs_update_dest(svc, dest, udest); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* call the update_service, because server weight may be changed */ - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - LeaveFunction(2); - - return 0; -} - - -/* - * Delete a destination (must be already unlinked from the service) - */ -static void __ip_vs_del_dest(struct ip_vs_dest *dest) -{ - ip_vs_kill_estimator(&dest->stats); - - /* - * Remove it from the d-linked list with the real services. - */ - write_lock_bh(&__ip_vs_rs_lock); - ip_vs_rs_unhash(dest); - write_unlock_bh(&__ip_vs_rs_lock); - - /* - * Decrease the refcnt of the dest, and free the dest - * if nobody refers to it (refcnt=0). Otherwise, throw - * the destination into the trash. - */ - if (atomic_dec_and_test(&dest->refcnt)) { - ip_vs_dst_reset(dest); - /* simply decrease svc->refcnt here, let the caller check - and release the service if nobody refers to it. - Only user context can release destination and service, - and only one user context can update virtual service at a - time, so the operation here is OK */ - atomic_dec(&dest->svc->refcnt); - kfree(dest); - } else { - IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " - "dest->refcnt=%d\n", - NIPQUAD(dest->addr), ntohs(dest->port), - atomic_read(&dest->refcnt)); - list_add(&dest->n_list, &ip_vs_dest_trash); - atomic_inc(&dest->refcnt); - } -} - - -/* - * Unlink a destination from the given service - */ -static void __ip_vs_unlink_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, - int svcupd) -{ - dest->flags &= ~IP_VS_DEST_F_AVAILABLE; - - /* - * Remove it from the d-linked destination list. - */ - list_del(&dest->n_list); - svc->num_dests--; - if (svcupd) { - /* - * Call the update_service function of its scheduler - */ - svc->scheduler->update_service(svc); - } -} - - -/* - * Delete a destination server in the given service - */ -static int -ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) -{ - struct ip_vs_dest *dest; - __be32 daddr = udest->addr; - __be16 dport = udest->port; - - EnterFunction(2); - - dest = ip_vs_lookup_dest(svc, daddr, dport); - if (dest == NULL) { - IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); - return -ENOENT; - } - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Unlink dest from the service - */ - __ip_vs_unlink_dest(svc, dest, 1); - - write_unlock_bh(&__ip_vs_svc_lock); - - /* - * Delete the destination - */ - __ip_vs_del_dest(dest); - - LeaveFunction(2); - - return 0; -} - - -/* - * Add a service into the service hash table - */ -static int -ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) -{ - int ret = 0; - struct ip_vs_scheduler *sched = NULL; - struct ip_vs_service *svc = NULL; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - /* Lookup the scheduler by 'u->sched_name' */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - ret = -ENOENT; - goto out_mod_dec; - } - - svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); - if (svc == NULL) { - IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); - ret = -ENOMEM; - goto out_err; - } - - /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 1); - atomic_set(&svc->refcnt, 0); - - svc->protocol = u->protocol; - svc->addr = u->addr; - svc->port = u->port; - svc->fwmark = u->fwmark; - svc->flags = u->flags; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - INIT_LIST_HEAD(&svc->destinations); - rwlock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); - - /* Bind the scheduler */ - ret = ip_vs_bind_scheduler(svc, sched); - if (ret) - goto out_err; - sched = NULL; - - /* Update the virtual service counters */ - if (svc->port == FTPPORT) - atomic_inc(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_inc(&ip_vs_nullsvc_counter); - - ip_vs_new_estimator(&svc->stats); - ip_vs_num_services++; - - /* Hash the service into the service table */ - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_hash(svc); - write_unlock_bh(&__ip_vs_svc_lock); - - *svc_p = svc; - return 0; - - out_err: - if (svc != NULL) { - if (svc->scheduler) - ip_vs_unbind_scheduler(svc); - if (svc->inc) { - local_bh_disable(); - ip_vs_app_inc_put(svc->inc); - local_bh_enable(); - } - kfree(svc); - } - ip_vs_scheduler_put(sched); - - out_mod_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -/* - * Edit a service and bind it with a new scheduler - */ -static int -ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) -{ - struct ip_vs_scheduler *sched, *old_sched; - int ret = 0; - - /* - * Lookup the scheduler, by 'u->sched_name' - */ - sched = ip_vs_scheduler_get(u->sched_name); - if (sched == NULL) { - IP_VS_INFO("Scheduler module ip_vs_%s not found\n", - u->sched_name); - return -ENOENT; - } - old_sched = sched; - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* - * Set the flags and timeout value - */ - svc->flags = u->flags | IP_VS_SVC_F_HASHED; - svc->timeout = u->timeout * HZ; - svc->netmask = u->netmask; - - old_sched = svc->scheduler; - if (sched != old_sched) { - /* - * Unbind the old scheduler - */ - if ((ret = ip_vs_unbind_scheduler(svc))) { - old_sched = sched; - goto out; - } - - /* - * Bind the new scheduler - */ - if ((ret = ip_vs_bind_scheduler(svc, sched))) { - /* - * If ip_vs_bind_scheduler fails, restore the old - * scheduler. - * The main reason of failure is out of memory. - * - * The question is if the old scheduler can be - * restored all the time. TODO: if it cannot be - * restored some time, we must delete the service, - * otherwise the system may crash. - */ - ip_vs_bind_scheduler(svc, old_sched); - old_sched = sched; - goto out; - } - } - - out: - write_unlock_bh(&__ip_vs_svc_lock); - - if (old_sched) - ip_vs_scheduler_put(old_sched); - - return ret; -} - - -/* - * Delete a service from the service list - * - The service must be unlinked, unlocked and not referenced! - * - We are called under _bh lock - */ -static void __ip_vs_del_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest, *nxt; - struct ip_vs_scheduler *old_sched; - - ip_vs_num_services--; - ip_vs_kill_estimator(&svc->stats); - - /* Unbind scheduler */ - old_sched = svc->scheduler; - ip_vs_unbind_scheduler(svc); - if (old_sched) - ip_vs_scheduler_put(old_sched); - - /* Unbind app inc */ - if (svc->inc) { - ip_vs_app_inc_put(svc->inc); - svc->inc = NULL; - } - - /* - * Unlink the whole destination list - */ - list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { - __ip_vs_unlink_dest(svc, dest, 0); - __ip_vs_del_dest(dest); - } - - /* - * Update the virtual service counters - */ - if (svc->port == FTPPORT) - atomic_dec(&ip_vs_ftpsvc_counter); - else if (svc->port == 0) - atomic_dec(&ip_vs_nullsvc_counter); - - /* - * Free the service if nobody refers to it - */ - if (atomic_read(&svc->refcnt) == 0) - kfree(svc); - - /* decrease the module use count */ - ip_vs_use_count_dec(); -} - -/* - * Delete a service from the service list - */ -static int ip_vs_del_service(struct ip_vs_service *svc) -{ - if (svc == NULL) - return -EEXIST; - - /* - * Unhash it from the service table - */ - write_lock_bh(&__ip_vs_svc_lock); - - ip_vs_svc_unhash(svc); - - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - __ip_vs_del_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - - return 0; -} - - -/* - * Flush all the virtual services - */ -static int ip_vs_flush(void) -{ - int idx; - struct ip_vs_service *svc, *nxt; - - /* - * Flush the service table hashed by <protocol,addr,port> - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - /* - * Flush the service table hashed by fwmark - */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry_safe(svc, nxt, - &ip_vs_svc_fwm_table[idx], f_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); - } - } - - return 0; -} - - -/* - * Zero counters in a service or all services - */ -static int ip_vs_zero_service(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - - write_lock_bh(&__ip_vs_svc_lock); - list_for_each_entry(dest, &svc->destinations, n_list) { - ip_vs_zero_stats(&dest->stats); - } - ip_vs_zero_stats(&svc->stats); - write_unlock_bh(&__ip_vs_svc_lock); - return 0; -} - -static int ip_vs_zero_all(void) -{ - int idx; - struct ip_vs_service *svc; - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - ip_vs_zero_service(svc); - } - } - - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - ip_vs_zero_service(svc); - } - } - - ip_vs_zero_stats(&ip_vs_stats); - return 0; -} - - -static int -proc_do_defense_mode(ctl_table *table, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val = *valp; - int rc; - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (*valp != val)) { - if ((*valp < 0) || (*valp > 3)) { - /* Restore the correct value */ - *valp = val; - } else { - update_defense_level(); - } - } - return rc; -} - - -static int -proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int *valp = table->data; - int val[2]; - int rc; - - /* backup the value first */ - memcpy(val, valp, sizeof(val)); - - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); - if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { - /* Restore the correct value */ - memcpy(valp, val, sizeof(val)); - } - return rc; -} - - -/* - * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) - */ - -static struct ctl_table vs_vars[] = { - { - .procname = "amemthresh", - .data = &sysctl_ip_vs_amemthresh, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#ifdef CONFIG_IP_VS_DEBUG - { - .procname = "debug_level", - .data = &sysctl_ip_vs_debug_level, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif - { - .procname = "am_droprate", - .data = &sysctl_ip_vs_am_droprate, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "drop_entry", - .data = &sysctl_ip_vs_drop_entry, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "drop_packet", - .data = &sysctl_ip_vs_drop_packet, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, - { - .procname = "secure_tcp", - .data = &sysctl_ip_vs_secure_tcp, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_do_defense_mode, - }, -#if 0 - { - .procname = "timeout_established", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synsent", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synrecv", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_finwait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_timewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_close", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_closewait", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_lastack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_listen", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_synack", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_udp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { - .procname = "timeout_icmp", - .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, -#endif - { - .procname = "cache_bypass", - .data = &sysctl_ip_vs_cache_bypass, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_nodest_conn", - .data = &sysctl_ip_vs_expire_nodest_conn, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "expire_quiescent_template", - .data = &sysctl_ip_vs_expire_quiescent_template, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .procname = "sync_threshold", - .data = &sysctl_ip_vs_sync_threshold, - .maxlen = sizeof(sysctl_ip_vs_sync_threshold), - .mode = 0644, - .proc_handler = &proc_do_sync_threshold, - }, - { - .procname = "nat_icmp_send", - .data = &sysctl_ip_vs_nat_icmp_send, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { .ctl_name = 0 } -}; - -const struct ctl_path net_vs_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "vs", }, - { } -}; -EXPORT_SYMBOL_GPL(net_vs_ctl_path); - -static struct ctl_table_header * sysctl_header; - -#ifdef CONFIG_PROC_FS - -struct ip_vs_iter { - struct list_head *table; - int bucket; -}; - -/* - * Write the contents of the VS rule table to a PROCfs file. - * (It is kept just for backward compatibility) - */ -static inline const char *ip_vs_fwd_name(unsigned flags) -{ - switch (flags & IP_VS_CONN_F_FWD_MASK) { - case IP_VS_CONN_F_LOCALNODE: - return "Local"; - case IP_VS_CONN_F_TUNNEL: - return "Tunnel"; - case IP_VS_CONN_F_DROUTE: - return "Route"; - default: - return "Masq"; - } -} - - -/* Get the Nth entry in the two lists */ -static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) -{ - struct ip_vs_iter *iter = seq->private; - int idx; - struct ip_vs_service *svc; - - /* look in hash by protocol */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (pos-- == 0){ - iter->table = ip_vs_svc_table; - iter->bucket = idx; - return svc; - } - } - } - - /* keep looking in fwmark */ - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (pos-- == 0) { - iter->table = ip_vs_svc_fwm_table; - iter->bucket = idx; - return svc; - } - } - } - - return NULL; -} - -static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) -{ - - read_lock_bh(&__ip_vs_svc_lock); - return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; -} - - -static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct list_head *e; - struct ip_vs_iter *iter; - struct ip_vs_service *svc; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ip_vs_info_array(seq,0); - - svc = v; - iter = seq->private; - - if (iter->table == ip_vs_svc_table) { - /* next service in table hashed by protocol */ - if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, s_list); - - - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], - s_list) { - return svc; - } - } - - iter->table = ip_vs_svc_fwm_table; - iter->bucket = -1; - goto scan_fwmark; - } - - /* next service in hashed by fwmark */ - if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) - return list_entry(e, struct ip_vs_service, f_list); - - scan_fwmark: - while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], - f_list) - return svc; - } - - return NULL; -} - -static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) -{ - read_unlock_bh(&__ip_vs_svc_lock); -} - - -static int ip_vs_info_seq_show(struct seq_file *seq, void *v) -{ - if (v == SEQ_START_TOKEN) { - seq_printf(seq, - "IP Virtual Server version %d.%d.%d (size=%d)\n", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - seq_puts(seq, - "Prot LocalAddress:Port Scheduler Flags\n"); - seq_puts(seq, - " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); - } else { - const struct ip_vs_service *svc = v; - const struct ip_vs_iter *iter = seq->private; - const struct ip_vs_dest *dest; - - if (iter->table == ip_vs_svc_table) - seq_printf(seq, "%s %08X:%04X %s ", - ip_vs_proto_name(svc->protocol), - ntohl(svc->addr), - ntohs(svc->port), - svc->scheduler->name); - else - seq_printf(seq, "FWM %08X %s ", - svc->fwmark, svc->scheduler->name); - - if (svc->flags & IP_VS_SVC_F_PERSISTENT) - seq_printf(seq, "persistent %d %08X\n", - svc->timeout, - ntohl(svc->netmask)); - else - seq_putc(seq, '\n'); - - list_for_each_entry(dest, &svc->destinations, n_list) { - seq_printf(seq, - " -> %08X:%04X %-7s %-6d %-10d %-10d\n", - ntohl(dest->addr), ntohs(dest->port), - ip_vs_fwd_name(atomic_read(&dest->conn_flags)), - atomic_read(&dest->weight), - atomic_read(&dest->activeconns), - atomic_read(&dest->inactconns)); - } - } - return 0; -} - -static const struct seq_operations ip_vs_info_seq_ops = { - .start = ip_vs_info_seq_start, - .next = ip_vs_info_seq_next, - .stop = ip_vs_info_seq_stop, - .show = ip_vs_info_seq_show, -}; - -static int ip_vs_info_open(struct inode *inode, struct file *file) -{ - return seq_open_private(file, &ip_vs_info_seq_ops, - sizeof(struct ip_vs_iter)); -} - -static const struct file_operations ip_vs_info_fops = { - .owner = THIS_MODULE, - .open = ip_vs_info_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release_private, -}; - -#endif - -struct ip_vs_stats ip_vs_stats = { - .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock), -}; - -#ifdef CONFIG_PROC_FS -static int ip_vs_stats_show(struct seq_file *seq, void *v) -{ - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Total Incoming Outgoing Incoming Outgoing\n"); - seq_printf(seq, - " Conns Packets Packets Bytes Bytes\n"); - - spin_lock_bh(&ip_vs_stats.lock); - seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns, - ip_vs_stats.inpkts, ip_vs_stats.outpkts, - (unsigned long long) ip_vs_stats.inbytes, - (unsigned long long) ip_vs_stats.outbytes); - -/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ - seq_puts(seq, - " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); - seq_printf(seq,"%8X %8X %8X %16X %16X\n", - ip_vs_stats.cps, - ip_vs_stats.inpps, - ip_vs_stats.outpps, - ip_vs_stats.inbps, - ip_vs_stats.outbps); - spin_unlock_bh(&ip_vs_stats.lock); - - return 0; -} - -static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) -{ - return single_open(file, ip_vs_stats_show, NULL); -} - -static const struct file_operations ip_vs_stats_fops = { - .owner = THIS_MODULE, - .open = ip_vs_stats_seq_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -#endif - -/* - * Set timeout values for tcp tcpfin udp in the timeout_table. - */ -static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) -{ - IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", - u->tcp_timeout, - u->tcp_fin_timeout, - u->udp_timeout); - -#ifdef CONFIG_IP_VS_PROTO_TCP - if (u->tcp_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] - = u->tcp_timeout * HZ; - } - - if (u->tcp_fin_timeout) { - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] - = u->tcp_fin_timeout * HZ; - } -#endif - -#ifdef CONFIG_IP_VS_PROTO_UDP - if (u->udp_timeout) { - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] - = u->udp_timeout * HZ; - } -#endif - return 0; -} - - -#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) -#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ - sizeof(struct ip_vs_dest_user)) -#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) -#define MAX_ARG_LEN SVCDEST_ARG_LEN - -static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { - [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, - [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, - [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, -}; - -static int -do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) -{ - int ret; - unsigned char arg[MAX_ARG_LEN]; - struct ip_vs_service_user *usvc; - struct ip_vs_service *svc; - struct ip_vs_dest_user *udest; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (len != set_arglen[SET_CMDID(cmd)]) { - IP_VS_ERR("set_ctl: len %u != %u\n", - len, set_arglen[SET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, len) != 0) - return -EFAULT; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - if (mutex_lock_interruptible(&__ip_vs_mutex)) { - ret = -ERESTARTSYS; - goto out_dec; - } - - if (cmd == IP_VS_SO_SET_FLUSH) { - /* Flush the virtual service */ - ret = ip_vs_flush(); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_TIMEOUT) { - /* Set timeout values for (tcp tcpfin udp) */ - ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); - goto out_unlock; - } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { - struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; - ret = stop_sync_thread(dm->state); - goto out_unlock; - } - - usvc = (struct ip_vs_service_user *)arg; - udest = (struct ip_vs_dest_user *)(usvc + 1); - - if (cmd == IP_VS_SO_SET_ZERO) { - /* if no service address is set, zero counters in all */ - if (!usvc->fwmark && !usvc->addr && !usvc->port) { - ret = ip_vs_zero_all(); - goto out_unlock; - } - } - - /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ - if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { - IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", - usvc->protocol, NIPQUAD(usvc->addr), - ntohs(usvc->port), usvc->sched_name); - ret = -EFAULT; - goto out_unlock; - } - - /* Lookup the exact service by <protocol, addr, port> or fwmark */ - if (usvc->fwmark == 0) - svc = __ip_vs_service_get(usvc->protocol, - usvc->addr, usvc->port); - else - svc = __ip_vs_svc_fwm_get(usvc->fwmark); - - if (cmd != IP_VS_SO_SET_ADD - && (svc == NULL || svc->protocol != usvc->protocol)) { - ret = -ESRCH; - goto out_unlock; - } - - switch (cmd) { - case IP_VS_SO_SET_ADD: - if (svc != NULL) - ret = -EEXIST; - else - ret = ip_vs_add_service(usvc, &svc); - break; - case IP_VS_SO_SET_EDIT: - ret = ip_vs_edit_service(svc, usvc); - break; - case IP_VS_SO_SET_DEL: - ret = ip_vs_del_service(svc); - if (!ret) - goto out_unlock; - break; - case IP_VS_SO_SET_ZERO: - ret = ip_vs_zero_service(svc); - break; - case IP_VS_SO_SET_ADDDEST: - ret = ip_vs_add_dest(svc, udest); - break; - case IP_VS_SO_SET_EDITDEST: - ret = ip_vs_edit_dest(svc, udest); - break; - case IP_VS_SO_SET_DELDEST: - ret = ip_vs_del_dest(svc, udest); - break; - default: - ret = -EINVAL; - } - - if (svc) - ip_vs_service_put(svc); - - out_unlock: - mutex_unlock(&__ip_vs_mutex); - out_dec: - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return ret; -} - - -static void -ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) -{ - spin_lock_bh(&src->lock); - memcpy(dst, src, (char*)&src->lock - (char*)src); - spin_unlock_bh(&src->lock); -} - -static void -ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) -{ - dst->protocol = src->protocol; - dst->addr = src->addr; - dst->port = src->port; - dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name)); - dst->flags = src->flags; - dst->timeout = src->timeout / HZ; - dst->netmask = src->netmask; - dst->num_dests = src->num_dests; - ip_vs_copy_stats(&dst->stats, &src->stats); -} - -static inline int -__ip_vs_get_service_entries(const struct ip_vs_get_services *get, - struct ip_vs_get_services __user *uptr) -{ - int idx, count=0; - struct ip_vs_service *svc; - struct ip_vs_service_entry entry; - int ret = 0; - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - - for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { - if (count >= get->num_services) - goto out; - memset(&entry, 0, sizeof(entry)); - ip_vs_copy_service(&entry, svc); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - goto out; - } - count++; - } - } - out: - return ret; -} - -static inline int -__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, - struct ip_vs_get_dests __user *uptr) -{ - struct ip_vs_service *svc; - int ret = 0; - - if (get->fwmark) - svc = __ip_vs_svc_fwm_get(get->fwmark); - else - svc = __ip_vs_service_get(get->protocol, - get->addr, get->port); - if (svc) { - int count = 0; - struct ip_vs_dest *dest; - struct ip_vs_dest_entry entry; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (count >= get->num_dests) - break; - - entry.addr = dest->addr; - entry.port = dest->port; - entry.conn_flags = atomic_read(&dest->conn_flags); - entry.weight = atomic_read(&dest->weight); - entry.u_threshold = dest->u_threshold; - entry.l_threshold = dest->l_threshold; - entry.activeconns = atomic_read(&dest->activeconns); - entry.inactconns = atomic_read(&dest->inactconns); - entry.persistconns = atomic_read(&dest->persistconns); - ip_vs_copy_stats(&entry.stats, &dest->stats); - if (copy_to_user(&uptr->entrytable[count], - &entry, sizeof(entry))) { - ret = -EFAULT; - break; - } - count++; - } - ip_vs_service_put(svc); - } else - ret = -ESRCH; - return ret; -} - -static inline void -__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) -{ -#ifdef CONFIG_IP_VS_PROTO_TCP - u->tcp_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; - u->tcp_fin_timeout = - ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - u->udp_timeout = - ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; -#endif -} - - -#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) -#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) -#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) -#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) -#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) -#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) -#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) - -static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { - [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, - [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, - [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, -}; - -static int -do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) -{ - unsigned char arg[128]; - int ret = 0; - - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - - if (*len < get_arglen[GET_CMDID(cmd)]) { - IP_VS_ERR("get_ctl: len %u < %u\n", - *len, get_arglen[GET_CMDID(cmd)]); - return -EINVAL; - } - - if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) - return -EFAULT; - - if (mutex_lock_interruptible(&__ip_vs_mutex)) - return -ERESTARTSYS; - - switch (cmd) { - case IP_VS_SO_GET_VERSION: - { - char buf[64]; - - sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", - NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); - if (copy_to_user(user, buf, strlen(buf)+1) != 0) { - ret = -EFAULT; - goto out; - } - *len = strlen(buf)+1; - } - break; - - case IP_VS_SO_GET_INFO: - { - struct ip_vs_getinfo info; - info.version = IP_VS_VERSION_CODE; - info.size = IP_VS_CONN_TAB_SIZE; - info.num_services = ip_vs_num_services; - if (copy_to_user(user, &info, sizeof(info)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_SERVICES: - { - struct ip_vs_get_services *get; - int size; - - get = (struct ip_vs_get_services *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_service_entry) * get->num_services; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_service_entries(get, user); - } - break; - - case IP_VS_SO_GET_SERVICE: - { - struct ip_vs_service_entry *entry; - struct ip_vs_service *svc; - - entry = (struct ip_vs_service_entry *)arg; - if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(entry->fwmark); - else - svc = __ip_vs_service_get(entry->protocol, - entry->addr, entry->port); - if (svc) { - ip_vs_copy_service(entry, svc); - if (copy_to_user(user, entry, sizeof(*entry)) != 0) - ret = -EFAULT; - ip_vs_service_put(svc); - } else - ret = -ESRCH; - } - break; - - case IP_VS_SO_GET_DESTS: - { - struct ip_vs_get_dests *get; - int size; - - get = (struct ip_vs_get_dests *)arg; - size = sizeof(*get) + - sizeof(struct ip_vs_dest_entry) * get->num_dests; - if (*len != size) { - IP_VS_ERR("length: %u != %u\n", *len, size); - ret = -EINVAL; - goto out; - } - ret = __ip_vs_get_dest_entries(get, user); - } - break; - - case IP_VS_SO_GET_TIMEOUT: - { - struct ip_vs_timeout_user t; - - __ip_vs_get_timeouts(&t); - if (copy_to_user(user, &t, sizeof(t)) != 0) - ret = -EFAULT; - } - break; - - case IP_VS_SO_GET_DAEMON: - { - struct ip_vs_daemon_user d[2]; - - memset(&d, 0, sizeof(d)); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) { - d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); - d[0].syncid = ip_vs_master_syncid; - } - if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { - d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); - d[1].syncid = ip_vs_backup_syncid; - } - if (copy_to_user(user, &d, sizeof(d)) != 0) - ret = -EFAULT; - } - break; - - default: - ret = -EINVAL; - } - - out: - mutex_unlock(&__ip_vs_mutex); - return ret; -} - - -static struct nf_sockopt_ops ip_vs_sockopts = { - .pf = PF_INET, - .set_optmin = IP_VS_BASE_CTL, - .set_optmax = IP_VS_SO_SET_MAX+1, - .set = do_ip_vs_set_ctl, - .get_optmin = IP_VS_BASE_CTL, - .get_optmax = IP_VS_SO_GET_MAX+1, - .get = do_ip_vs_get_ctl, - .owner = THIS_MODULE, -}; - - -int __init ip_vs_control_init(void) -{ - int ret; - int idx; - - EnterFunction(2); - - ret = nf_register_sockopt(&ip_vs_sockopts); - if (ret) { - IP_VS_ERR("cannot register sockopt.\n"); - return ret; - } - - proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops); - proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops); - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); - } - - ip_vs_new_estimator(&ip_vs_stats); - - /* Hook the defense timer */ - schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); - - LeaveFunction(2); - return 0; -} - - -void ip_vs_control_cleanup(void) -{ - EnterFunction(2); - ip_vs_trash_cleanup(); - cancel_rearming_delayed_work(&defense_work); - cancel_work_sync(&defense_work.work); - ip_vs_kill_estimator(&ip_vs_stats); - unregister_sysctl_table(sysctl_header); - proc_net_remove(&init_net, "ip_vs_stats"); - proc_net_remove(&init_net, "ip_vs"); - nf_unregister_sockopt(&ip_vs_sockopts); - LeaveFunction(2); -} diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c deleted file mode 100644 index fa66824d264f..000000000000 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ /dev/null @@ -1,258 +0,0 @@ -/* - * IPVS: Destination Hashing scheduling module - * - * Authors: Wensong Zhang <wensong@gnuchina.org> - * - * Inspired by the consistent hashing scheduler patch from - * Thomas Proell <proellt@gmx.de> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The dh algorithm is to select server by the hash key of destination IP - * address. The pseudo code is as follows: - * - * n <- servernode[dest_ip]; - * if (n is dead) OR - * (n is overloaded) OR (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet destination IP address to the current server - * array. If the dh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> - -#include <net/ip_vs.h> - - -/* - * IPVS DH bucket - */ -struct ip_vs_dh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS DH entry hash table - */ -#ifndef CONFIG_IP_VS_DH_TAB_BITS -#define CONFIG_IP_VS_DH_TAB_BITS 8 -#endif -#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS -#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) -#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS DH entry - */ -static inline unsigned ip_vs_dh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_dh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_dh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) -{ - int i; - struct ip_vs_dh_bucket *b; - - b = tbl; - for (i=0; i<IP_VS_DH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_dh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl; - - /* allocate the DH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_dh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_dh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_dh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_dh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_dh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Destination hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_dh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_dh_bucket *)svc->sched_data; - dest = ip_vs_dh_get(tbl, iph->daddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->daddr), - NIPQUAD(dest->addr), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS DH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_dh_scheduler = -{ - .name = "dh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_dh_scheduler.n_list), - .init_service = ip_vs_dh_init_svc, - .done_service = ip_vs_dh_done_svc, - .update_service = ip_vs_dh_update_svc, - .schedule = ip_vs_dh_schedule, -}; - - -static int __init ip_vs_dh_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_dh_scheduler); -} - - -static void __exit ip_vs_dh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); -} - - -module_init(ip_vs_dh_init); -module_exit(ip_vs_dh_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c deleted file mode 100644 index 5a20f93bd7f9..000000000000 --- a/net/ipv4/ipvs/ip_vs_est.c +++ /dev/null @@ -1,162 +0,0 @@ -/* - * ip_vs_est.c: simple rate estimator for IPVS - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ -#include <linux/kernel.h> -#include <linux/jiffies.h> -#include <linux/slab.h> -#include <linux/types.h> -#include <linux/interrupt.h> -#include <linux/sysctl.h> -#include <linux/list.h> - -#include <net/ip_vs.h> - -/* - This code is to estimate rate in a shorter interval (such as 8 - seconds) for virtual services and real servers. For measure rate in a - long interval, it is easy to implement a user level daemon which - periodically reads those statistical counters and measure rate. - - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - - We measure rate during the last 8 seconds every 2 seconds: - - avgrate = avgrate*(1-W) + rate*W - - where W = 2^(-2) - - NOTES. - - * The stored value for average bps is scaled by 2^5, so that maximal - rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. - - * A lot code is taken from net/sched/estimator.c - */ - - -static void estimation_timer(unsigned long arg); - -static LIST_HEAD(est_list); -static DEFINE_SPINLOCK(est_lock); -static DEFINE_TIMER(est_timer, estimation_timer, 0, 0); - -static void estimation_timer(unsigned long arg) -{ - struct ip_vs_estimator *e; - struct ip_vs_stats *s; - u32 n_conns; - u32 n_inpkts, n_outpkts; - u64 n_inbytes, n_outbytes; - u32 rate; - - spin_lock(&est_lock); - list_for_each_entry(e, &est_list, list) { - s = container_of(e, struct ip_vs_stats, est); - - spin_lock(&s->lock); - n_conns = s->conns; - n_inpkts = s->inpkts; - n_outpkts = s->outpkts; - n_inbytes = s->inbytes; - n_outbytes = s->outbytes; - - /* scaled by 2^10, but divided 2 seconds */ - rate = (n_conns - e->last_conns)<<9; - e->last_conns = n_conns; - e->cps += ((long)rate - (long)e->cps)>>2; - s->cps = (e->cps+0x1FF)>>10; - - rate = (n_inpkts - e->last_inpkts)<<9; - e->last_inpkts = n_inpkts; - e->inpps += ((long)rate - (long)e->inpps)>>2; - s->inpps = (e->inpps+0x1FF)>>10; - - rate = (n_outpkts - e->last_outpkts)<<9; - e->last_outpkts = n_outpkts; - e->outpps += ((long)rate - (long)e->outpps)>>2; - s->outpps = (e->outpps+0x1FF)>>10; - - rate = (n_inbytes - e->last_inbytes)<<4; - e->last_inbytes = n_inbytes; - e->inbps += ((long)rate - (long)e->inbps)>>2; - s->inbps = (e->inbps+0xF)>>5; - - rate = (n_outbytes - e->last_outbytes)<<4; - e->last_outbytes = n_outbytes; - e->outbps += ((long)rate - (long)e->outbps)>>2; - s->outbps = (e->outbps+0xF)>>5; - spin_unlock(&s->lock); - } - spin_unlock(&est_lock); - mod_timer(&est_timer, jiffies + 2*HZ); -} - -void ip_vs_new_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - INIT_LIST_HEAD(&est->list); - - est->last_conns = stats->conns; - est->cps = stats->cps<<10; - - est->last_inpkts = stats->inpkts; - est->inpps = stats->inpps<<10; - - est->last_outpkts = stats->outpkts; - est->outpps = stats->outpps<<10; - - est->last_inbytes = stats->inbytes; - est->inbps = stats->inbps<<5; - - est->last_outbytes = stats->outbytes; - est->outbps = stats->outbps<<5; - - spin_lock_bh(&est_lock); - if (list_empty(&est_list)) - mod_timer(&est_timer, jiffies + 2 * HZ); - list_add(&est->list, &est_list); - spin_unlock_bh(&est_lock); -} - -void ip_vs_kill_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - spin_lock_bh(&est_lock); - list_del(&est->list); - while (list_empty(&est_list) && try_to_del_timer_sync(&est_timer) < 0) { - spin_unlock_bh(&est_lock); - cpu_relax(); - spin_lock_bh(&est_lock); - } - spin_unlock_bh(&est_lock); -} - -void ip_vs_zero_estimator(struct ip_vs_stats *stats) -{ - struct ip_vs_estimator *est = &stats->est; - - /* set counters zero, caller must hold the stats->lock lock */ - est->last_inbytes = 0; - est->last_outbytes = 0; - est->last_conns = 0; - est->last_inpkts = 0; - est->last_outpkts = 0; - est->cps = 0; - est->inpps = 0; - est->outpps = 0; - est->inbps = 0; - est->outbps = 0; -} diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c deleted file mode 100644 index c1c758e4f733..000000000000 --- a/net/ipv4/ipvs/ip_vs_ftp.c +++ /dev/null @@ -1,393 +0,0 @@ -/* - * ip_vs_ftp.c: IPVS ftp application module - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * Changes: - * - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference - * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. - * - * IP_MASQ_FTP ftp masquerading module - * - * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 - * - * Author: Wouter Gadeyne - * - */ - -#include <linux/module.h> -#include <linux/moduleparam.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/netfilter.h> -#include <net/protocol.h> -#include <net/tcp.h> -#include <asm/unaligned.h> - -#include <net/ip_vs.h> - - -#define SERVER_STRING "227 Entering Passive Mode (" -#define CLIENT_STRING "PORT " - - -/* - * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper - * First port is set to the default port. - */ -static unsigned short ports[IP_VS_APP_MAX_PORTS] = {21, 0}; -module_param_array(ports, ushort, NULL, 0); -MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); - - -/* Dummy variable */ -static int ip_vs_ftp_pasv; - - -static int -ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -static int -ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) -{ - return 0; -} - - -/* - * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern" and terminated with the "term" character. - * <addr,port> is in network order. - */ -static int ip_vs_ftp_get_addrport(char *data, char *data_limit, - const char *pattern, size_t plen, char term, - __be32 *addr, __be16 *port, - char **start, char **end) -{ - unsigned char p[6]; - int i = 0; - - if (data_limit - data < plen) { - /* check if there is partial match */ - if (strnicmp(data, pattern, data_limit - data) == 0) - return -1; - else - return 0; - } - - if (strnicmp(data, pattern, plen) != 0) { - return 0; - } - *start = data + plen; - - for (data = *start; *data != term; data++) { - if (data == data_limit) - return -1; - } - *end = data; - - memset(p, 0, sizeof(p)); - for (data = *start; data != *end; data++) { - if (*data >= '0' && *data <= '9') { - p[i] = p[i]*10 + *data - '0'; - } else if (*data == ',' && i < 5) { - i++; - } else { - /* unexpected character */ - return -1; - } - } - - if (i != 5) - return -1; - - *addr = get_unaligned((__be32 *)p); - *port = get_unaligned((__be16 *)(p + 4)); - return 1; -} - - -/* - * Look at outgoing ftp packets to catch the response to a PASV command - * from the server (inside-to-outside). - * When we see one, we build a connection entry with the client address, - * client port 0 (unknown at the moment), the server address and the - * server port. Mark the current connection entry as a control channel - * of the new entry. All this work is just to make the data connection - * can be scheduled to the right server later. - * - * The outgoing packet should be something like - * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". - * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. - */ -static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_limit; - char *start, *end; - __be32 from; - __be16 port; - struct ip_vs_conn *n_cp; - char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ - unsigned buf_len; - int ret; - - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - if (cp->app_data == &ip_vs_ftp_pasv) { - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - data = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - if (ip_vs_ftp_get_addrport(data, data_limit, - SERVER_STRING, - sizeof(SERVER_STRING)-1, ')', - &from, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PASV response (%u.%u.%u.%u:%d) -> " - "%u.%u.%u.%u:%d detected\n", - NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); - - /* - * Now update or create an connection entry for it - */ - n_cp = ip_vs_conn_out_get(iph->protocol, from, port, - cp->caddr, 0); - if (!n_cp) { - n_cp = ip_vs_conn_new(IPPROTO_TCP, - cp->caddr, 0, - cp->vaddr, port, - from, port, - IP_VS_CONN_F_NO_CPORT, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Replace the old passive address with the new one - */ - from = n_cp->vaddr; - port = n_cp->vport; - sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), - (ntohs(port)>>8)&255, ntohs(port)&255); - buf_len = strlen(buf); - - /* - * Calculate required delta-offset to keep TCP happy - */ - *diff = buf_len - (end-start); - - if (*diff == 0) { - /* simply replace it with new passive address */ - memcpy(start, buf, buf_len); - ret = 1; - } else { - ret = !ip_vs_skb_replace(skb, GFP_ATOMIC, start, - end-start, buf, buf_len); - } - - cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - return ret; - } - return 1; -} - - -/* - * Look at incoming ftp packets to catch the PASV/PORT command - * (outside-to-inside). - * - * The incoming packet having the PORT command should be something like - * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". - * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. - * In this case, we create a connection entry using the client address and - * port, so that the active ftp data connection from the server can reach - * the client. - */ -static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) -{ - struct iphdr *iph; - struct tcphdr *th; - char *data, *data_start, *data_limit; - char *start, *end; - __be32 to; - __be16 port; - struct ip_vs_conn *n_cp; - - /* no diff required for incoming packets */ - *diff = 0; - - /* Only useful for established sessions */ - if (cp->state != IP_VS_TCP_S_ESTABLISHED) - return 1; - - /* Linear packets are much easier to deal with. */ - if (!skb_make_writable(skb, skb->len)) - return 0; - - /* - * Detecting whether it is passive - */ - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - - /* Since there may be OPTIONS in the TCP packet and the HLEN is - the length of the header in 32-bit multiples, it is accurate - to calculate data address by th+HLEN*4 */ - data = data_start = (char *)th + (th->doff << 2); - data_limit = skb_tail_pointer(skb); - - while (data <= data_limit - 6) { - if (strnicmp(data, "PASV\r\n", 6) == 0) { - /* Passive mode on */ - IP_VS_DBG(7, "got PASV at %td of %td\n", - data - data_start, - data_limit - data_start); - cp->app_data = &ip_vs_ftp_pasv; - return 1; - } - data++; - } - - /* - * To support virtual FTP server, the scenerio is as follows: - * FTP client ----> Load Balancer ----> FTP server - * First detect the port number in the application data, - * then create a new connection entry for the coming data - * connection. - */ - if (ip_vs_ftp_get_addrport(data_start, data_limit, - CLIENT_STRING, sizeof(CLIENT_STRING)-1, - '\r', &to, &port, - &start, &end) != 1) - return 1; - - IP_VS_DBG(7, "PORT %u.%u.%u.%u:%d detected\n", - NIPQUAD(to), ntohs(port)); - - /* Passive mode off */ - cp->app_data = NULL; - - /* - * Now update or create a connection entry for it - */ - IP_VS_DBG(7, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", - ip_vs_proto_name(iph->protocol), - NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0); - - n_cp = ip_vs_conn_in_get(iph->protocol, - to, port, - cp->vaddr, htons(ntohs(cp->vport)-1)); - if (!n_cp) { - n_cp = ip_vs_conn_new(IPPROTO_TCP, - to, port, - cp->vaddr, htons(ntohs(cp->vport)-1), - cp->daddr, htons(ntohs(cp->dport)-1), - 0, - cp->dest); - if (!n_cp) - return 0; - - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } - - /* - * Move tunnel to listen state - */ - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - - return 1; -} - - -static struct ip_vs_app ip_vs_ftp = { - .name = "ftp", - .type = IP_VS_APP_TYPE_FTP, - .protocol = IPPROTO_TCP, - .module = THIS_MODULE, - .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), - .init_conn = ip_vs_ftp_init_conn, - .done_conn = ip_vs_ftp_done_conn, - .bind_conn = NULL, - .unbind_conn = NULL, - .pkt_out = ip_vs_ftp_out, - .pkt_in = ip_vs_ftp_in, -}; - - -/* - * ip_vs_ftp initialization - */ -static int __init ip_vs_ftp_init(void) -{ - int i, ret; - struct ip_vs_app *app = &ip_vs_ftp; - - ret = register_ip_vs_app(app); - if (ret) - return ret; - - for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { - if (!ports[i]) - continue; - ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); - if (ret) - break; - IP_VS_INFO("%s: loaded support on port[%d] = %d\n", - app->name, i, ports[i]); - } - - if (ret) - unregister_ip_vs_app(app); - - return ret; -} - - -/* - * ip_vs_ftp finish. - */ -static void __exit ip_vs_ftp_exit(void) -{ - unregister_ip_vs_app(&ip_vs_ftp); -} - - -module_init(ip_vs_ftp_init); -module_exit(ip_vs_ftp_exit); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c deleted file mode 100644 index 7a6a319f544a..000000000000 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ /dev/null @@ -1,571 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection scheduling module - * - * Authors: Wensong Zhang <wensong@gnuchina.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Martin Hamilton : fixed the terrible locking bugs - * *lock(tbl->lock) ==> *lock(&tbl->lock) - * Wensong Zhang : fixed the uninitilized tbl->lock bug - * Wensong Zhang : added doing full expiration check to - * collect stale entries of 24+ hours when - * no partial expire check in a half hour - * Julian Anastasov : replaced del_timer call with del_timer_sync - * to avoid the possible race between timer - * handler and del_timer thread in SMP - * - */ - -/* - * The lblc algorithm is as follows (pseudo code): - * - * if cachenode[dest_ip] is null then - * n, cachenode[dest_ip] <- {weighted least-conn node}; - * else - * n <- cachenode[dest_ip]; - * if (n is dead) OR - * (n.conns>n.weight AND - * there is a node m with m.conns<m.weight/2) then - * n, cachenode[dest_ip] <- {weighted least-conn node}; - * - * return n; - * - * Thanks must go to Wenzhuo Zhang for talking WCCP to me and pushing - * me to write this module. - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/jiffies.h> - -/* for sysctl */ -#include <linux/fs.h> -#include <linux/sysctl.h> - -#include <net/ip_vs.h> - - -/* - * It is for garbage collection of stale IPVS lblc entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblc entry hash table - */ -#ifndef CONFIG_IP_VS_LBLC_TAB_BITS -#define CONFIG_IP_VS_LBLC_TAB_BITS 10 -#endif -#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS -#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) -#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) - - -/* - * IPVS lblc entry represents an association between destination - * IP address and its destination server - */ -struct ip_vs_lblc_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest *dest; /* real server (cache) */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblc hash table - */ -struct ip_vs_lblc_table { - rwlock_t lock; /* lock for this table */ - struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLC sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblc_expiration", - .data = &sysctl_ip_vs_lblc_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -/* - * new/free a ip_vs_lblc_entry, which is a mapping of a destionation - * IP address to a server. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_new(__be32 daddr, struct ip_vs_dest *dest) -{ - struct ip_vs_lblc_entry *en; - - en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); - if (en == NULL) { - IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); - return NULL; - } - - INIT_LIST_HEAD(&en->list); - en->addr = daddr; - - atomic_inc(&dest->refcnt); - en->dest = dest; - - return en; -} - - -static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) -{ - list_del(&en->list); - /* - * We don't kfree dest because it is refered either by its service - * or the trash dest list. - */ - atomic_dec(&en->dest->refcnt); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLC entry - */ -static inline unsigned ip_vs_lblc_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblc_table. - * returns bool success. - */ -static int -ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) -{ - unsigned hash; - - if (!list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Hash by destination IP address - */ - hash = ip_vs_lblc_hashkey(en->addr); - - write_lock(&tbl->lock); - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); - write_unlock(&tbl->lock); - - return 1; -} - - -/* - * Get ip_vs_lblc_entry associated with supplied parameters. - */ -static inline struct ip_vs_lblc_entry * -ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __be32 addr) -{ - unsigned hash; - struct ip_vs_lblc_entry *en; - - hash = ip_vs_lblc_hashkey(addr); - - read_lock(&tbl->lock); - - list_for_each_entry(en, &tbl->bucket[hash], list) { - if (en->addr == addr) { - /* HIT */ - read_unlock(&tbl->lock); - return en; - } - } - - read_unlock(&tbl->lock); - - return NULL; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) -{ - int i; - struct ip_vs_lblc_entry *en, *nxt; - - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&tbl->lock); - } -} - - -static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) -{ - unsigned long now = jiffies; - int i, j; - struct ip_vs_lblc_entry *en, *nxt; - - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { - j = (j + 1) & IP_VS_LBLC_TAB_MASK; - - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, - en->lastuse + sysctl_ip_vs_lblc_expiration)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&tbl->lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblc table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblc_check_expire(unsigned long data) -{ - struct ip_vs_lblc_table *tbl; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblc_entry *en, *nxt; - - tbl = (struct ip_vs_lblc_table *)data; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblc_full_check(tbl); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { - j = (j + 1) & IP_VS_LBLC_TAB_MASK; - - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) - continue; - - ip_vs_lblc_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&tbl->lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - - -static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblc_table *tbl; - - /* - * Allocate the ip_vs_lblc_table for this service - */ - tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_lblc_table)); - - /* - * Initialize the hash buckets - */ - for (i=0; i<IP_VS_LBLC_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); - } - rwlock_init(&tbl->lock); - tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblc_check_expire, - (unsigned long)tbl); - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; - add_timer(&tbl->periodic_timer); - - return 0; -} - - -static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblc_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblc_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_lblc_table)); - - return 0; -} - - -static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_lblc_table *tbl; - struct ip_vs_lblc_entry *en; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_lblc_table *)svc->sched_data; - en = ip_vs_lblc_get(tbl, iph->daddr); - if (en == NULL) { - dest = __ip_vs_wlc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - en = ip_vs_lblc_new(iph->daddr, dest); - if (en == NULL) { - return NULL; - } - ip_vs_lblc_hash(tbl, en); - } else { - dest = en->dest; - if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest, svc)) { - dest = __ip_vs_wlc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - atomic_dec(&en->dest->refcnt); - atomic_inc(&dest->refcnt); - en->dest = dest; - } - } - en->lastuse = jiffies; - - IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(en->addr), - NIPQUAD(dest->addr), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLC Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblc_scheduler = -{ - .name = "lblc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list), - .init_service = ip_vs_lblc_init_svc, - .done_service = ip_vs_lblc_done_svc, - .update_service = ip_vs_lblc_update_svc, - .schedule = ip_vs_lblc_schedule, -}; - - -static int __init ip_vs_lblc_init(void) -{ - int ret; - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblc_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); -} - - -module_init(ip_vs_lblc_init); -module_exit(ip_vs_lblc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c deleted file mode 100644 index c234e73968a6..000000000000 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ /dev/null @@ -1,760 +0,0 @@ -/* - * IPVS: Locality-Based Least-Connection with Replication scheduler - * - * Authors: Wensong Zhang <wensong@gnuchina.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Julian Anastasov : Added the missing (dest->weight>0) - * condition in the ip_vs_dest_set_max. - * - */ - -/* - * The lblc/r algorithm is as follows (pseudo code): - * - * if serverSet[dest_ip] is null then - * n, serverSet[dest_ip] <- {weighted least-conn node}; - * else - * n <- {least-conn (alive) node in serverSet[dest_ip]}; - * if (n is null) OR - * (n.conns>n.weight AND - * there is a node m with m.conns<m.weight/2) then - * n <- {weighted least-conn node}; - * add n to serverSet[dest_ip]; - * if |serverSet[dest_ip]| > 1 AND - * now - serverSet[dest_ip].lastMod > T then - * m <- {most conn node in serverSet[dest_ip]}; - * remove m from serverSet[dest_ip]; - * if serverSet[dest_ip] changed then - * serverSet[dest_ip].lastMod <- now; - * - * return n; - * - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/jiffies.h> - -/* for sysctl */ -#include <linux/fs.h> -#include <linux/sysctl.h> -#include <net/net_namespace.h> - -#include <net/ip_vs.h> - - -/* - * It is for garbage collection of stale IPVS lblcr entries, - * when the table is full. - */ -#define CHECK_EXPIRE_INTERVAL (60*HZ) -#define ENTRY_TIMEOUT (6*60*HZ) - -/* - * It is for full expiration check. - * When there is no partial expiration check (garbage collection) - * in a half hour, do a full expiration check to collect stale - * entries that haven't been touched for a day. - */ -#define COUNT_FOR_FULL_EXPIRATION 30 -static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; - - -/* - * for IPVS lblcr entry hash table - */ -#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS -#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 -#endif -#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS -#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) -#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) - - -/* - * IPVS destination set structure and operations - */ -struct ip_vs_dest_list { - struct ip_vs_dest_list *next; /* list link */ - struct ip_vs_dest *dest; /* destination server */ -}; - -struct ip_vs_dest_set { - atomic_t size; /* set size */ - unsigned long lastmod; /* last modified time */ - struct ip_vs_dest_list *list; /* destination list */ - rwlock_t lock; /* lock for this list */ -}; - - -static struct ip_vs_dest_list * -ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e; - - for (e=set->list; e!=NULL; e=e->next) { - if (e->dest == dest) - /* already existed */ - return NULL; - } - - e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); - if (e == NULL) { - IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); - return NULL; - } - - atomic_inc(&dest->refcnt); - e->dest = dest; - - /* link it to the list */ - write_lock(&set->lock); - e->next = set->list; - set->list = e; - atomic_inc(&set->size); - write_unlock(&set->lock); - - set->lastmod = jiffies; - return e; -} - -static void -ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) -{ - struct ip_vs_dest_list *e, **ep; - - write_lock(&set->lock); - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - if (e->dest == dest) { - /* HIT */ - *ep = e->next; - atomic_dec(&set->size); - set->lastmod = jiffies; - atomic_dec(&e->dest->refcnt); - kfree(e); - break; - } - ep = &e->next; - } - write_unlock(&set->lock); -} - -static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) -{ - struct ip_vs_dest_list *e, **ep; - - write_lock(&set->lock); - for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { - *ep = e->next; - /* - * We don't kfree dest because it is refered either - * by its service or by the trash dest list. - */ - atomic_dec(&e->dest->refcnt); - kfree(e); - } - write_unlock(&set->lock); -} - -/* get weighted least-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *least; - int loh, doh; - - if (set == NULL) - return NULL; - - read_lock(&set->lock); - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - least = e->dest; - if (least->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if ((atomic_read(&least->weight) > 0) - && (least->flags & IP_VS_DEST_F_AVAILABLE)) { - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - read_unlock(&set->lock); - return NULL; - - /* find the destination with the weighted least load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if ((loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) - && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { - least = dest; - loh = doh; - } - } - read_unlock(&set->lock); - - IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - return least; -} - - -/* get weighted most-connection node in the destination set */ -static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) -{ - register struct ip_vs_dest_list *e; - struct ip_vs_dest *dest, *most; - int moh, doh; - - if (set == NULL) - return NULL; - - read_lock(&set->lock); - /* select the first destination server, whose weight > 0 */ - for (e=set->list; e!=NULL; e=e->next) { - most = e->dest; - if (atomic_read(&most->weight) > 0) { - moh = atomic_read(&most->activeconns) * 50 - + atomic_read(&most->inactconns); - goto nextstage; - } - } - read_unlock(&set->lock); - return NULL; - - /* find the destination with the weighted most load */ - nextstage: - for (e=e->next; e!=NULL; e=e->next) { - dest = e->dest; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ - if ((moh * atomic_read(&dest->weight) < - doh * atomic_read(&most->weight)) - && (atomic_read(&dest->weight) > 0)) { - most = dest; - moh = doh; - } - } - read_unlock(&set->lock); - - IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(most->addr), ntohs(most->port), - atomic_read(&most->activeconns), - atomic_read(&most->refcnt), - atomic_read(&most->weight), moh); - return most; -} - - -/* - * IPVS lblcr entry represents an association between destination - * IP address and its destination server set - */ -struct ip_vs_lblcr_entry { - struct list_head list; - __be32 addr; /* destination IP address */ - struct ip_vs_dest_set set; /* destination server set */ - unsigned long lastuse; /* last used time */ -}; - - -/* - * IPVS lblcr hash table - */ -struct ip_vs_lblcr_table { - rwlock_t lock; /* lock for this table */ - struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ - atomic_t entries; /* number of entries */ - int max_size; /* maximum size of entries */ - struct timer_list periodic_timer; /* collect stale entries */ - int rover; /* rover for expire check */ - int counter; /* counter for no expire */ -}; - - -/* - * IPVS LBLCR sysctl table - */ - -static ctl_table vs_vars_table[] = { - { - .procname = "lblcr_expiration", - .data = &sysctl_ip_vs_lblcr_expiration, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - }, - { .ctl_name = 0 } -}; - -static struct ctl_table_header * sysctl_header; - -/* - * new/free a ip_vs_lblcr_entry, which is a mapping of a destination - * IP address to a server. - */ -static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__be32 daddr) -{ - struct ip_vs_lblcr_entry *en; - - en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); - if (en == NULL) { - IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); - return NULL; - } - - INIT_LIST_HEAD(&en->list); - en->addr = daddr; - - /* initilize its dest set */ - atomic_set(&(en->set.size), 0); - en->set.list = NULL; - rwlock_init(&en->set.lock); - - return en; -} - - -static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) -{ - list_del(&en->list); - ip_vs_dest_set_eraseall(&en->set); - kfree(en); -} - - -/* - * Returns hash value for IPVS LBLCR entry - */ -static inline unsigned ip_vs_lblcr_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; -} - - -/* - * Hash an entry in the ip_vs_lblcr_table. - * returns bool success. - */ -static int -ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) -{ - unsigned hash; - - if (!list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Hash by destination IP address - */ - hash = ip_vs_lblcr_hashkey(en->addr); - - write_lock(&tbl->lock); - list_add(&en->list, &tbl->bucket[hash]); - atomic_inc(&tbl->entries); - write_unlock(&tbl->lock); - - return 1; -} - - -/* - * Get ip_vs_lblcr_entry associated with supplied parameters. - */ -static inline struct ip_vs_lblcr_entry * -ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __be32 addr) -{ - unsigned hash; - struct ip_vs_lblcr_entry *en; - - hash = ip_vs_lblcr_hashkey(addr); - - read_lock(&tbl->lock); - - list_for_each_entry(en, &tbl->bucket[hash], list) { - if (en->addr == addr) { - /* HIT */ - read_unlock(&tbl->lock); - return en; - } - } - - read_unlock(&tbl->lock); - - return NULL; -} - - -/* - * Flush all the entries of the specified table. - */ -static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) -{ - int i; - struct ip_vs_lblcr_entry *en, *nxt; - - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&tbl->lock); - } -} - - -static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) -{ - unsigned long now = jiffies; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { - j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, - now)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - } - write_unlock(&tbl->lock); - } - tbl->rover = j; -} - - -/* - * Periodical timer handler for IPVS lblcr table - * It is used to collect stale entries when the number of entries - * exceeds the maximum size of the table. - * - * Fixme: we probably need more complicated algorithm to collect - * entries that have not been used for a long time even - * if the number of entries doesn't exceed the maximum size - * of the table. - * The full expiration check is for this purpose now. - */ -static void ip_vs_lblcr_check_expire(unsigned long data) -{ - struct ip_vs_lblcr_table *tbl; - unsigned long now = jiffies; - int goal; - int i, j; - struct ip_vs_lblcr_entry *en, *nxt; - - tbl = (struct ip_vs_lblcr_table *)data; - - if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { - /* do full expiration check */ - ip_vs_lblcr_full_check(tbl); - tbl->counter = 1; - goto out; - } - - if (atomic_read(&tbl->entries) <= tbl->max_size) { - tbl->counter++; - goto out; - } - - goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; - if (goal > tbl->max_size/2) - goal = tbl->max_size/2; - - for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { - j = (j + 1) & IP_VS_LBLCR_TAB_MASK; - - write_lock(&tbl->lock); - list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { - if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) - continue; - - ip_vs_lblcr_free(en); - atomic_dec(&tbl->entries); - goal--; - } - write_unlock(&tbl->lock); - if (goal <= 0) - break; - } - tbl->rover = j; - - out: - mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); -} - -static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc) -{ - int i; - struct ip_vs_lblcr_table *tbl; - - /* - * Allocate the ip_vs_lblcr_table for this service - */ - tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_lblcr_table)); - - /* - * Initialize the hash buckets - */ - for (i=0; i<IP_VS_LBLCR_TAB_SIZE; i++) { - INIT_LIST_HEAD(&tbl->bucket[i]); - } - rwlock_init(&tbl->lock); - tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; - tbl->rover = 0; - tbl->counter = 1; - - /* - * Hook periodic timer for garbage collection - */ - setup_timer(&tbl->periodic_timer, ip_vs_lblcr_check_expire, - (unsigned long)tbl); - tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; - add_timer(&tbl->periodic_timer); - - return 0; -} - - -static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_lblcr_table *tbl = svc->sched_data; - - /* remove periodic timer */ - del_timer_sync(&tbl->periodic_timer); - - /* got to clean up table entries here */ - ip_vs_lblcr_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_lblcr_table)); - - return 0; -} - - -static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline struct ip_vs_dest * -__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) -{ - struct ip_vs_dest *dest, *least; - int loh, doh; - - /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connection. - */ - list_for_each_entry(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - if (atomic_read(&dest->weight) > 0) { - least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -/* - * If this destination server is overloaded and there is a less loaded - * server, then return true. - */ -static inline int -is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) -{ - if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { - struct ip_vs_dest *d; - - list_for_each_entry(d, &svc->destinations, n_list) { - if (atomic_read(&d->activeconns)*2 - < atomic_read(&d->weight)) { - return 1; - } - } - } - return 0; -} - - -/* - * Locality-Based (weighted) Least-Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_lblcr_table *tbl; - struct ip_vs_lblcr_entry *en; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_lblcr_table *)svc->sched_data; - en = ip_vs_lblcr_get(tbl, iph->daddr); - if (en == NULL) { - dest = __ip_vs_wlc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - en = ip_vs_lblcr_new(iph->daddr); - if (en == NULL) { - return NULL; - } - ip_vs_dest_set_insert(&en->set, dest); - ip_vs_lblcr_hash(tbl, en); - } else { - dest = ip_vs_dest_set_min(&en->set); - if (!dest || is_overloaded(dest, svc)) { - dest = __ip_vs_wlc_schedule(svc, iph); - if (dest == NULL) { - IP_VS_DBG(1, "no destination available\n"); - return NULL; - } - ip_vs_dest_set_insert(&en->set, dest); - } - if (atomic_read(&en->set.size) > 1 && - jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { - struct ip_vs_dest *m; - m = ip_vs_dest_set_max(&en->set); - if (m) - ip_vs_dest_set_erase(&en->set, m); - } - } - en->lastuse = jiffies; - - IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(en->addr), - NIPQUAD(dest->addr), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS LBLCR Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_lblcr_scheduler = -{ - .name = "lblcr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lblcr_scheduler.n_list), - .init_service = ip_vs_lblcr_init_svc, - .done_service = ip_vs_lblcr_done_svc, - .update_service = ip_vs_lblcr_update_svc, - .schedule = ip_vs_lblcr_schedule, -}; - - -static int __init ip_vs_lblcr_init(void) -{ - int ret; - - sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); - ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); - if (ret) - unregister_sysctl_table(sysctl_header); - return ret; -} - - -static void __exit ip_vs_lblcr_cleanup(void) -{ - unregister_sysctl_table(sysctl_header); - unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); -} - - -module_init(ip_vs_lblcr_init); -module_exit(ip_vs_lblcr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c deleted file mode 100644 index ebcdbf75ac65..000000000000 --- a/net/ipv4/ipvs/ip_vs_lc.c +++ /dev/null @@ -1,121 +0,0 @@ -/* - * IPVS: Least-Connection Scheduling module - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : added the ip_vs_lc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int ip_vs_lc_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_lc_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_lc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline unsigned int -ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); - - /* - * Simply select the server with the least number of - * (activeconns<<5) + inactconns - * Except whose weight is equal to zero. - * If the weight is equal to zero, it means that the server is - * quiesced, the existing connections to the server still get - * served, but no new connection is assigned to the server. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || - atomic_read(&dest->weight) == 0) - continue; - doh = ip_vs_lc_dest_overhead(dest); - if (!least || doh < loh) { - least = dest; - loh = doh; - } - } - - if (least) - IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->inactconns)); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_lc_scheduler = { - .name = "lc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_lc_scheduler.n_list), - .init_service = ip_vs_lc_init_svc, - .done_service = ip_vs_lc_done_svc, - .update_service = ip_vs_lc_update_svc, - .schedule = ip_vs_lc_schedule, -}; - - -static int __init ip_vs_lc_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; -} - -static void __exit ip_vs_lc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); -} - -module_init(ip_vs_lc_init); -module_exit(ip_vs_lc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c deleted file mode 100644 index 92f3a6770031..000000000000 --- a/net/ipv4/ipvs/ip_vs_nq.c +++ /dev/null @@ -1,159 +0,0 @@ -/* - * IPVS: Never Queue scheduling module - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The NQ algorithm adopts a two-speed model. When there is an idle server - * available, the job will be sent to the idle server, instead of waiting - * for a fast one. When there is no idle server available, the job will be - * sent to the server that minimize its expected delay (The Shortest - * Expected Delay scheduling algorithm). - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri <marko@buuri.name> for talking NQ to me. - * - * The difference between NQ and SED is that NQ can improve overall - * system utilization. - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int -ip_vs_nq_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_nq_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_nq_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline unsigned int -ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least = NULL; - unsigned int loh = 0, doh; - - IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - - if (dest->flags & IP_VS_DEST_F_OVERLOAD || - !atomic_read(&dest->weight)) - continue; - - doh = ip_vs_nq_dest_overhead(dest); - - /* return the server directly if it is idle */ - if (atomic_read(&dest->activeconns) == 0) { - least = dest; - loh = doh; - goto out; - } - - if (!least || - (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight))) { - least = dest; - loh = doh; - } - } - - if (!least) - return NULL; - - out: - IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_nq_scheduler = -{ - .name = "nq", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_nq_scheduler.n_list), - .init_service = ip_vs_nq_init_svc, - .done_service = ip_vs_nq_done_svc, - .update_service = ip_vs_nq_update_svc, - .schedule = ip_vs_nq_schedule, -}; - - -static int __init ip_vs_nq_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -static void __exit ip_vs_nq_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); -} - -module_init(ip_vs_nq_init); -module_exit(ip_vs_nq_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c deleted file mode 100644 index 6099a88fc200..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto.c +++ /dev/null @@ -1,233 +0,0 @@ -/* - * ip_vs_proto.c: transport protocol load balancing support for IPVS - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/ip.h> -#include <net/protocol.h> -#include <net/tcp.h> -#include <net/udp.h> -#include <asm/system.h> -#include <linux/stat.h> -#include <linux/proc_fs.h> - -#include <net/ip_vs.h> - - -/* - * IPVS protocols can only be registered/unregistered when the ipvs - * module is loaded/unloaded, so no lock is needed in accessing the - * ipvs protocol table. - */ - -#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ -#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) - -static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; - - -/* - * register an ipvs protocol - */ -static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp->next = ip_vs_proto_table[hash]; - ip_vs_proto_table[hash] = pp; - - if (pp->init != NULL) - pp->init(pp); - - return 0; -} - - -/* - * unregister an ipvs protocol - */ -static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) -{ - struct ip_vs_protocol **pp_p; - unsigned hash = IP_VS_PROTO_HASH(pp->protocol); - - pp_p = &ip_vs_proto_table[hash]; - for (; *pp_p; pp_p = &(*pp_p)->next) { - if (*pp_p == pp) { - *pp_p = pp->next; - if (pp->exit != NULL) - pp->exit(pp); - return 0; - } - } - - return -ESRCH; -} - - -/* - * get ip_vs_protocol object by its proto. - */ -struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) -{ - struct ip_vs_protocol *pp; - unsigned hash = IP_VS_PROTO_HASH(proto); - - for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { - if (pp->protocol == proto) - return pp; - } - - return NULL; -} - - -/* - * Propagate event for state change to all protocols - */ -void ip_vs_protocol_timeout_change(int flags) -{ - struct ip_vs_protocol *pp; - int i; - - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { - if (pp->timeout_change) - pp->timeout_change(pp, flags); - } - } -} - - -int * -ip_vs_create_timeout_table(int *table, int size) -{ - return kmemdup(table, size, GFP_ATOMIC); -} - - -/* - * Set timeout value for state specified by name - */ -int -ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) -{ - int i; - - if (!table || !name || !to) - return -EINVAL; - - for (i = 0; i < num; i++) { - if (strcmp(names[i], name)) - continue; - table[i] = to * HZ; - return 0; - } - return -ENOENT; -} - - -const char * ip_vs_state_name(__u16 proto, int state) -{ - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); - - if (pp == NULL || pp->state_name == NULL) - return (IPPROTO_IP == proto) ? "NONE" : "ERR!"; - return pp->state_name(state); -} - - -void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, - const struct sk_buff *skb, - int offset, - const char *msg) -{ - char buf[128]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else if (ih->frag_off & htons(IP_OFFSET)) - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else { - __be16 _ports[2], *pptr -; - pptr = skb_header_pointer(skb, offset + ih->ihl*4, - sizeof(_ports), _ports); - if (pptr == NULL) - sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, - NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - else - sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", - pp->name, - NIPQUAD(ih->saddr), - ntohs(pptr[0]), - NIPQUAD(ih->daddr), - ntohs(pptr[1])); - } - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -int __init ip_vs_protocol_init(void) -{ - char protocols[64]; -#define REGISTER_PROTOCOL(p) \ - do { \ - register_ip_vs_protocol(p); \ - strcat(protocols, ", "); \ - strcat(protocols, (p)->name); \ - } while (0) - - protocols[0] = '\0'; - protocols[2] = '\0'; -#ifdef CONFIG_IP_VS_PROTO_TCP - REGISTER_PROTOCOL(&ip_vs_protocol_tcp); -#endif -#ifdef CONFIG_IP_VS_PROTO_UDP - REGISTER_PROTOCOL(&ip_vs_protocol_udp); -#endif -#ifdef CONFIG_IP_VS_PROTO_AH - REGISTER_PROTOCOL(&ip_vs_protocol_ah); -#endif -#ifdef CONFIG_IP_VS_PROTO_ESP - REGISTER_PROTOCOL(&ip_vs_protocol_esp); -#endif - IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); - - return 0; -} - - -void ip_vs_protocol_cleanup(void) -{ - struct ip_vs_protocol *pp; - int i; - - /* unregister all the ipvs protocols */ - for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { - while ((pp = ip_vs_proto_table[i]) != NULL) - unregister_ip_vs_protocol(pp); - } -} diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c deleted file mode 100644 index 73e0ea87c1f5..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ /dev/null @@ -1,178 +0,0 @@ -/* - * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 - * Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -ah_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static int -ah_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * AH is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -static void ah_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void ah_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -struct ip_vs_protocol ip_vs_protocol_ah = { - .name = "AH", - .protocol = IPPROTO_AH, - .num_states = 1, - .dont_defrag = 1, - .init = ah_init, - .exit = ah_exit, - .conn_schedule = ah_conn_schedule, - .conn_in_get = ah_conn_in_get, - .conn_out_get = ah_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = ah_debug_packet, - .timeout_change = NULL, /* ISAKMP */ - .set_state_timeout = NULL, -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c deleted file mode 100644 index 21d70c8ffa54..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS - * - * Authors: Julian Anastasov <ja@ssi.bg>, February 2002 - * Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * version 2 as published by the Free Software Foundation; - * - */ - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -/* TODO: - -struct isakmp_hdr { - __u8 icookie[8]; - __u8 rcookie[8]; - __u8 np; - __u8 version; - __u8 xchgtype; - __u8 flags; - __u32 msgid; - __u32 length; -}; - -*/ - -#define PORT_ISAKMP 500 - - -static struct ip_vs_conn * -esp_conn_in_get(const struct sk_buff *skb, - struct ip_vs_protocol *pp, - const struct iphdr *iph, - unsigned int proto_off, - int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - /* - * We are not sure if the packet is from our - * service, so our conn_schedule hook should return NF_ACCEPT - */ - IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static struct ip_vs_conn * -esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->saddr, - htons(PORT_ISAKMP), - iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(IPPROTO_UDP, - iph->daddr, - htons(PORT_ISAKMP), - iph->saddr, - htons(PORT_ISAKMP)); - } - - if (!cp) { - IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " - "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", - inverse ? "ICMP+" : "", - pp->name, - NIPQUAD(iph->saddr), - NIPQUAD(iph->daddr)); - } - - return cp; -} - - -static int -esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - /* - * ESP is only related traffic. Pass the packet to IP stack. - */ - *verdict = NF_ACCEPT; - return 0; -} - - -static void -esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "%s TRUNCATED", pp->name); - else - sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", - pp->name, NIPQUAD(ih->saddr), - NIPQUAD(ih->daddr)); - - printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); -} - - -static void esp_init(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -static void esp_exit(struct ip_vs_protocol *pp) -{ - /* nothing to do now */ -} - - -struct ip_vs_protocol ip_vs_protocol_esp = { - .name = "ESP", - .protocol = IPPROTO_ESP, - .num_states = 1, - .dont_defrag = 1, - .init = esp_init, - .exit = esp_exit, - .conn_schedule = esp_conn_schedule, - .conn_in_get = esp_conn_in_get, - .conn_out_get = esp_conn_out_get, - .snat_handler = NULL, - .dnat_handler = NULL, - .csum_check = NULL, - .state_transition = NULL, - .register_app = NULL, - .unregister_app = NULL, - .app_conn_bind = NULL, - .debug_packet = esp_debug_packet, - .timeout_change = NULL, /* ISAKMP */ -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c deleted file mode 100644 index d0ea467986a0..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ /dev/null @@ -1,614 +0,0 @@ -/* - * ip_vs_proto_tcp.c: TCP load balancing support for IPVS - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/kernel.h> -#include <linux/ip.h> -#include <linux/tcp.h> /* for tcphdr */ -#include <net/ip.h> -#include <net/tcp.h> /* for csum_tcpudp_magic */ -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -static struct ip_vs_conn * -tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_in_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); - } else { - return ip_vs_conn_in_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); - } -} - -static struct ip_vs_conn * -tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - return ip_vs_conn_out_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); - } else { - return ip_vs_conn_out_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); - } -} - - -static int -tcp_conn_schedule(struct sk_buff *skb, - struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); - if (th == NULL) { - *verdict = NF_DROP; - return 0; - } - - if (th->syn && - (svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, - ip_hdr(skb)->daddr, th->dest))) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -tcp_fast_csum_update(struct tcphdr *tcph, __be32 oldip, __be32 newip, - __be16 oldport, __be16 newport) -{ - tcph->check = - csum_fold(ip_vs_check_diff4(oldip, newip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(tcph->check)))); -} - - -static int -tcp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - const unsigned int tcphoff = ip_hdrlen(skb); - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) - return 0; - - /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - tcph = (void *)ip_hdr(skb) + tcphoff; - tcph->source = cp->vport; - - /* Adjust TCP checksums */ - if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, - skb->len - tcphoff, - cp->protocol, skb->csum); - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, tcph->check, - (char*)&(tcph->check) - (char*)tcph); - } - return 1; -} - - -static int -tcp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct tcphdr *tcph; - const unsigned int tcphoff = ip_hdrlen(skb); - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, tcphoff+sizeof(*tcph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn and iph ack_seq stuff - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - tcph = (void *)ip_hdr(skb) + tcphoff; - tcph->dest = cp->dport; - - /* - * Adjust TCP checksums - */ - if (!cp->app) { - /* Only port and addr are changed, do fast csum update */ - tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - tcph->check = 0; - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, - skb->len - tcphoff, - cp->protocol, skb->csum); - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - const unsigned int tcphoff = ip_hdrlen(skb); - - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); - case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, - skb->len - tcphoff, - ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - - return 1; -} - - -#define TCP_DIR_INPUT 0 -#define TCP_DIR_OUTPUT 4 -#define TCP_DIR_INPUT_ONLY 8 - -static const int tcp_state_off[IP_VS_DIR_LAST] = { - [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, - [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, - [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, -}; - -/* - * Timeout table[state] - */ -static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 120*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = "NONE", - [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", - [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", - [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", - [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", - [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", - [IP_VS_TCP_S_CLOSE] = "CLOSE", - [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", - [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", - [IP_VS_TCP_S_LISTEN] = "LISTEN", - [IP_VS_TCP_S_SYNACK] = "SYNACK", - [IP_VS_TCP_S_LAST] = "BUG!", -}; - -#define sNO IP_VS_TCP_S_NONE -#define sES IP_VS_TCP_S_ESTABLISHED -#define sSS IP_VS_TCP_S_SYN_SENT -#define sSR IP_VS_TCP_S_SYN_RECV -#define sFW IP_VS_TCP_S_FIN_WAIT -#define sTW IP_VS_TCP_S_TIME_WAIT -#define sCL IP_VS_TCP_S_CLOSE -#define sCW IP_VS_TCP_S_CLOSE_WAIT -#define sLA IP_VS_TCP_S_LAST_ACK -#define sLI IP_VS_TCP_S_LISTEN -#define sSA IP_VS_TCP_S_SYNACK - -struct tcp_states_t { - int next_state[IP_VS_TCP_S_LAST]; -}; - -static const char * tcp_state_name(int state) -{ - if (state >= IP_VS_TCP_S_LAST) - return "ERR!"; - return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; -} - -static struct tcp_states_t tcp_states [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t tcp_states_dos [] = { -/* INPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, -/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, -/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, - -/* OUTPUT */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, -/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, -/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, -/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, - -/* INPUT-ONLY */ -/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ -/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, -/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, -/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, -/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, -}; - -static struct tcp_states_t *tcp_state_table = tcp_states; - - -static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) -{ - int on = (flags & 1); /* secure_tcp */ - - /* - ** FIXME: change secure_tcp to independent sysctl var - ** or make it per-service or per-app because it is valid - ** for most if not for all of the applications. Something - ** like "capabilities" (flags) for each object. - */ - tcp_state_table = (on? tcp_states_dos : tcp_states); -} - -static int -tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, - tcp_state_name_table, sname, to); -} - -static inline int tcp_state_idx(struct tcphdr *th) -{ - if (th->rst) - return 3; - if (th->syn) - return 0; - if (th->fin) - return 1; - if (th->ack) - return 2; - return -1; -} - -static inline void -set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, - int direction, struct tcphdr *th) -{ - int state_idx; - int new_state = IP_VS_TCP_S_CLOSE; - int state_off = tcp_state_off[direction]; - - /* - * Update state offset to INPUT_ONLY if necessary - * or delete NO_OUTPUT flag if output packet detected - */ - if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { - if (state_off == TCP_DIR_OUTPUT) - cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; - else - state_off = TCP_DIR_INPUT_ONLY; - } - - if ((state_idx = tcp_state_idx(th)) < 0) { - IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); - goto tcp_state_out; - } - - new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; - - tcp_state_out: - if (new_state != cp->state) { - struct ip_vs_dest *dest = cp->dest; - - IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" - "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", - pp->name, - (state_off==TCP_DIR_OUTPUT)?"output ":"input ", - th->syn? 'S' : '.', - th->fin? 'F' : '.', - th->ack? 'A' : '.', - th->rst? 'R' : '.', - NIPQUAD(cp->daddr), ntohs(cp->dport), - NIPQUAD(cp->caddr), ntohs(cp->cport), - tcp_state_name(cp->state), - tcp_state_name(new_state), - atomic_read(&cp->refcnt)); - if (dest) { - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (new_state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - } - - cp->timeout = pp->timeout_table[cp->state = new_state]; -} - - -/* - * Handle state transitions - */ -static int -tcp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - struct tcphdr _tcph, *th; - - th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); - if (th == NULL) - return 0; - - spin_lock(&cp->lock); - set_tcp_state(pp, cp, direction, th); - spin_unlock(&cp->lock); - - return 1; -} - - -/* - * Hash table for TCP application incarnations - */ -#define TCP_APP_TAB_BITS 4 -#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) -#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) - -static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(tcp_app_lock); - -static inline __u16 tcp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) - & TCP_APP_TAB_MASK; -} - - -static int tcp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = tcp_app_hashkey(port); - - spin_lock_bh(&tcp_app_lock); - list_for_each_entry(i, &tcp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &tcp_apps[hash]); - atomic_inc(&ip_vs_protocol_tcp.appcnt); - - out: - spin_unlock_bh(&tcp_app_lock); - return ret; -} - - -static void -tcp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&tcp_app_lock); - atomic_dec(&ip_vs_protocol_tcp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&tcp_app_lock); -} - - -static int -tcp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = tcp_app_hashkey(cp->vport); - - spin_lock(&tcp_app_lock); - list_for_each_entry(inc, &tcp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&tcp_app_lock); - - IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u to app %s on port %u\n", - __func__, - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - inc->name, ntohs(inc->port)); - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&tcp_app_lock); - - out: - return result; -} - - -/* - * Set LISTEN timeout. (ip_vs_conn_put will setup timer) - */ -void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) -{ - spin_lock(&cp->lock); - cp->state = IP_VS_TCP_S_LISTEN; - cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; - spin_unlock(&cp->lock); -} - - -static void ip_vs_tcp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(tcp_apps); - pp->timeout_table = tcp_timeouts; -} - - -static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_tcp = { - .name = "TCP", - .protocol = IPPROTO_TCP, - .num_states = IP_VS_TCP_S_LAST, - .dont_defrag = 0, - .appcnt = ATOMIC_INIT(0), - .init = ip_vs_tcp_init, - .exit = ip_vs_tcp_exit, - .register_app = tcp_register_app, - .unregister_app = tcp_unregister_app, - .conn_schedule = tcp_conn_schedule, - .conn_in_get = tcp_conn_in_get, - .conn_out_get = tcp_conn_out_get, - .snat_handler = tcp_snat_handler, - .dnat_handler = tcp_dnat_handler, - .csum_check = tcp_csum_check, - .state_name = tcp_state_name, - .state_transition = tcp_state_transition, - .app_conn_bind = tcp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = tcp_timeout_change, - .set_state_timeout = tcp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c deleted file mode 100644 index c6be5d56823f..000000000000 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * ip_vs_proto_udp.c: UDP load balancing support for IPVS - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/in.h> -#include <linux/ip.h> -#include <linux/kernel.h> -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> -#include <linux/udp.h> - -#include <net/ip_vs.h> -#include <net/ip.h> - -static struct ip_vs_conn * -udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_in_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); - } - - return cp; -} - - -static struct ip_vs_conn * -udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, - const struct iphdr *iph, unsigned int proto_off, int inverse) -{ - struct ip_vs_conn *cp; - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_ports), _ports); - if (pptr == NULL) - return NULL; - - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(iph->protocol, - iph->saddr, pptr[0], - iph->daddr, pptr[1]); - } else { - cp = ip_vs_conn_out_get(iph->protocol, - iph->daddr, pptr[1], - iph->saddr, pptr[0]); - } - - return cp; -} - - -static int -udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, - int *verdict, struct ip_vs_conn **cpp) -{ - struct ip_vs_service *svc; - struct udphdr _udph, *uh; - - uh = skb_header_pointer(skb, ip_hdrlen(skb), - sizeof(_udph), &_udph); - if (uh == NULL) { - *verdict = NF_DROP; - return 0; - } - - if ((svc = ip_vs_service_get(skb->mark, ip_hdr(skb)->protocol, - ip_hdr(skb)->daddr, uh->dest))) { - if (ip_vs_todrop()) { - /* - * It seems that we are very loaded. - * We have to drop this packet :( - */ - ip_vs_service_put(svc); - *verdict = NF_DROP; - return 0; - } - - /* - * Let the virtual server select a real server for the - * incoming connection, and create a connection entry. - */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { - *verdict = ip_vs_leave(svc, skb, pp); - return 0; - } - ip_vs_service_put(svc); - } - return 1; -} - - -static inline void -udp_fast_csum_update(struct udphdr *uhdr, __be32 oldip, __be32 newip, - __be16 oldport, __be16 newport) -{ - uhdr->check = - csum_fold(ip_vs_check_diff4(oldip, newip, - ip_vs_check_diff2(oldport, newport, - ~csum_unfold(uhdr->check)))); - if (!uhdr->check) - uhdr->check = CSUM_MANGLED_0; -} - -static int -udp_snat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - const unsigned int udphoff = ip_hdrlen(skb); - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) - return 0; - - /* - * Call application helper if needed - */ - if (!ip_vs_app_pkt_out(cp, skb)) - return 0; - } - - udph = (void *)ip_hdr(skb) + udphoff; - udph->source = cp->vport; - - /* - * Adjust UDP checksums - */ - if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->daddr, cp->vaddr, - cp->dport, cp->vport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, - skb->len - udphoff, - cp->protocol, skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", - pp->name, udph->check, - (char*)&(udph->check) - (char*)udph); - } - return 1; -} - - -static int -udp_dnat_handler(struct sk_buff *skb, - struct ip_vs_protocol *pp, struct ip_vs_conn *cp) -{ - struct udphdr *udph; - unsigned int udphoff = ip_hdrlen(skb); - - /* csum_check requires unshared skb */ - if (!skb_make_writable(skb, udphoff+sizeof(*udph))) - return 0; - - if (unlikely(cp->app != NULL)) { - /* Some checks before mangling */ - if (pp->csum_check && !pp->csum_check(skb, pp)) - return 0; - - /* - * Attempt ip_vs_app call. - * It will fix ip_vs_conn - */ - if (!ip_vs_app_pkt_in(cp, skb)) - return 0; - } - - udph = (void *)ip_hdr(skb) + udphoff; - udph->dest = cp->dport; - - /* - * Adjust UDP checksums - */ - if (!cp->app && (udph->check != 0)) { - /* Only port and addr are changed, do fast csum update */ - udp_fast_csum_update(udph, cp->vaddr, cp->daddr, - cp->vport, cp->dport); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; - } else { - /* full checksum calculation */ - udph->check = 0; - skb->csum = skb_checksum(skb, udphoff, skb->len - udphoff, 0); - udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, - skb->len - udphoff, - cp->protocol, skb->csum); - if (udph->check == 0) - udph->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_UNNECESSARY; - } - return 1; -} - - -static int -udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) -{ - struct udphdr _udph, *uh; - const unsigned int udphoff = ip_hdrlen(skb); - - uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); - if (uh == NULL) - return 0; - - if (uh->check != 0) { - switch (skb->ip_summed) { - case CHECKSUM_NONE: - skb->csum = skb_checksum(skb, udphoff, - skb->len - udphoff, 0); - case CHECKSUM_COMPLETE: - if (csum_tcpudp_magic(ip_hdr(skb)->saddr, - ip_hdr(skb)->daddr, - skb->len - udphoff, - ip_hdr(skb)->protocol, - skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, - "Failed checksum for"); - return 0; - } - break; - default: - /* No need to checksum. */ - break; - } - } - return 1; -} - - -/* - * Note: the caller guarantees that only one of register_app, - * unregister_app or app_conn_bind is called each time. - */ - -#define UDP_APP_TAB_BITS 4 -#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) -#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) - -static struct list_head udp_apps[UDP_APP_TAB_SIZE]; -static DEFINE_SPINLOCK(udp_app_lock); - -static inline __u16 udp_app_hashkey(__be16 port) -{ - return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) - & UDP_APP_TAB_MASK; -} - - -static int udp_register_app(struct ip_vs_app *inc) -{ - struct ip_vs_app *i; - __u16 hash; - __be16 port = inc->port; - int ret = 0; - - hash = udp_app_hashkey(port); - - - spin_lock_bh(&udp_app_lock); - list_for_each_entry(i, &udp_apps[hash], p_list) { - if (i->port == port) { - ret = -EEXIST; - goto out; - } - } - list_add(&inc->p_list, &udp_apps[hash]); - atomic_inc(&ip_vs_protocol_udp.appcnt); - - out: - spin_unlock_bh(&udp_app_lock); - return ret; -} - - -static void -udp_unregister_app(struct ip_vs_app *inc) -{ - spin_lock_bh(&udp_app_lock); - atomic_dec(&ip_vs_protocol_udp.appcnt); - list_del(&inc->p_list); - spin_unlock_bh(&udp_app_lock); -} - - -static int udp_app_conn_bind(struct ip_vs_conn *cp) -{ - int hash; - struct ip_vs_app *inc; - int result = 0; - - /* Default binding: bind app only for NAT */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) - return 0; - - /* Lookup application incarnations and bind the right one */ - hash = udp_app_hashkey(cp->vport); - - spin_lock(&udp_app_lock); - list_for_each_entry(inc, &udp_apps[hash], p_list) { - if (inc->port == cp->vport) { - if (unlikely(!ip_vs_app_inc_get(inc))) - break; - spin_unlock(&udp_app_lock); - - IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" - "%u.%u.%u.%u:%u to app %s on port %u\n", - __func__, - NIPQUAD(cp->caddr), ntohs(cp->cport), - NIPQUAD(cp->vaddr), ntohs(cp->vport), - inc->name, ntohs(inc->port)); - cp->app = inc; - if (inc->init_conn) - result = inc->init_conn(inc, cp); - goto out; - } - } - spin_unlock(&udp_app_lock); - - out: - return result; -} - - -static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = 5*60*HZ, - [IP_VS_UDP_S_LAST] = 2*HZ, -}; - -static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { - [IP_VS_UDP_S_NORMAL] = "UDP", - [IP_VS_UDP_S_LAST] = "BUG!", -}; - - -static int -udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) -{ - return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, - udp_state_name_table, sname, to); -} - -static const char * udp_state_name(int state) -{ - if (state >= IP_VS_UDP_S_LAST) - return "ERR!"; - return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; -} - -static int -udp_state_transition(struct ip_vs_conn *cp, int direction, - const struct sk_buff *skb, - struct ip_vs_protocol *pp) -{ - cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; - return 1; -} - -static void udp_init(struct ip_vs_protocol *pp) -{ - IP_VS_INIT_HASH_TABLE(udp_apps); - pp->timeout_table = udp_timeouts; -} - -static void udp_exit(struct ip_vs_protocol *pp) -{ -} - - -struct ip_vs_protocol ip_vs_protocol_udp = { - .name = "UDP", - .protocol = IPPROTO_UDP, - .num_states = IP_VS_UDP_S_LAST, - .dont_defrag = 0, - .init = udp_init, - .exit = udp_exit, - .conn_schedule = udp_conn_schedule, - .conn_in_get = udp_conn_in_get, - .conn_out_get = udp_conn_out_get, - .snat_handler = udp_snat_handler, - .dnat_handler = udp_dnat_handler, - .csum_check = udp_csum_check, - .state_transition = udp_state_transition, - .state_name = udp_state_name, - .register_app = udp_register_app, - .unregister_app = udp_unregister_app, - .app_conn_bind = udp_app_conn_bind, - .debug_packet = ip_vs_tcpudp_debug_packet, - .timeout_change = NULL, - .set_state_timeout = udp_set_state_timeout, -}; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c deleted file mode 100644 index 358110d17e59..000000000000 --- a/net/ipv4/ipvs/ip_vs_rr.c +++ /dev/null @@ -1,116 +0,0 @@ -/* - * IPVS: Round-Robin Scheduling module - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Fixes/Changes: - * Wensong Zhang : changed the ip_vs_rr_schedule to return dest - * Julian Anastasov : fixed the NULL pointer access bug in debugging - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_rr_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int ip_vs_rr_init_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -static int ip_vs_rr_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int ip_vs_rr_update_svc(struct ip_vs_service *svc) -{ - svc->sched_data = &svc->destinations; - return 0; -} - - -/* - * Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct list_head *p, *q; - struct ip_vs_dest *dest; - - IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); - - write_lock(&svc->sched_lock); - p = (struct list_head *)svc->sched_data; - p = p->next; - q = p; - do { - /* skip list head */ - if (q == &svc->destinations) { - q = q->next; - continue; - } - - dest = list_entry(q, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) - /* HIT */ - goto out; - q = q->next; - } while (q != p); - write_unlock(&svc->sched_lock); - return NULL; - - out: - svc->sched_data = q; - write_unlock(&svc->sched_lock); - IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), atomic_read(&dest->weight)); - - return dest; -} - - -static struct ip_vs_scheduler ip_vs_rr_scheduler = { - .name = "rr", /* name */ - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_rr_scheduler.n_list), - .init_service = ip_vs_rr_init_svc, - .done_service = ip_vs_rr_done_svc, - .update_service = ip_vs_rr_update_svc, - .schedule = ip_vs_rr_schedule, -}; - -static int __init ip_vs_rr_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -static void __exit ip_vs_rr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); -} - -module_init(ip_vs_rr_init); -module_exit(ip_vs_rr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c deleted file mode 100644 index a46ad9e35016..000000000000 --- a/net/ipv4/ipvs/ip_vs_sched.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the Netfilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/module.h> -#include <linux/spinlock.h> -#include <linux/interrupt.h> -#include <asm/string.h> -#include <linux/kmod.h> -#include <linux/sysctl.h> - -#include <net/ip_vs.h> - -/* - * IPVS scheduler list - */ -static LIST_HEAD(ip_vs_schedulers); - -/* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); - - -/* - * Bind a service with a scheduler - */ -int ip_vs_bind_scheduler(struct ip_vs_service *svc, - struct ip_vs_scheduler *scheduler) -{ - int ret; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - if (scheduler == NULL) { - IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); - return -EINVAL; - } - - svc->scheduler = scheduler; - - if (scheduler->init_service) { - ret = scheduler->init_service(svc); - if (ret) { - IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); - return ret; - } - } - - return 0; -} - - -/* - * Unbind a service with its scheduler - */ -int ip_vs_unbind_scheduler(struct ip_vs_service *svc) -{ - struct ip_vs_scheduler *sched; - - if (svc == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); - return -EINVAL; - } - - sched = svc->scheduler; - if (sched == NULL) { - IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); - return -EINVAL; - } - - if (sched->done_service) { - if (sched->done_service(svc) != 0) { - IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); - return -EINVAL; - } - } - - svc->scheduler = NULL; - return 0; -} - - -/* - * Get scheduler in the scheduler list by name - */ -static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", - sched_name); - - read_lock_bh(&__ip_vs_sched_lock); - - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - /* - * Test and get the modules atomically - */ - if (sched->module && !try_module_get(sched->module)) { - /* - * This scheduler is just deleted - */ - continue; - } - if (strcmp(sched_name, sched->name)==0) { - /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); - return sched; - } - if (sched->module) - module_put(sched->module); - } - - read_unlock_bh(&__ip_vs_sched_lock); - return NULL; -} - - -/* - * Lookup scheduler and try to load it if it doesn't exist - */ -struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) -{ - struct ip_vs_scheduler *sched; - - /* - * Search for the scheduler by sched_name - */ - sched = ip_vs_sched_getbyname(sched_name); - - /* - * If scheduler not found, load the module and search again - */ - if (sched == NULL) { - request_module("ip_vs_%s", sched_name); - sched = ip_vs_sched_getbyname(sched_name); - } - - return sched; -} - -void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) -{ - if (scheduler->module) - module_put(scheduler->module); -} - - -/* - * Register a scheduler in the scheduler list - */ -int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - struct ip_vs_scheduler *sched; - - if (!scheduler) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - if (!scheduler->name) { - IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); - return -EINVAL; - } - - /* increase the module use count */ - ip_vs_use_count_inc(); - - write_lock_bh(&__ip_vs_sched_lock); - - if (!list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already linked\n", scheduler->name); - return -EINVAL; - } - - /* - * Make sure that the scheduler with this name doesn't exist - * in the scheduler list. - */ - list_for_each_entry(sched, &ip_vs_schedulers, n_list) { - if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); - ip_vs_use_count_dec(); - IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " - "already existed in the system\n", - scheduler->name); - return -EINVAL; - } - } - /* - * Add it into the d-linked scheduler list - */ - list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); - - IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); - - return 0; -} - - -/* - * Unregister a scheduler from the scheduler list - */ -int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) -{ - if (!scheduler) { - IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); - return -EINVAL; - } - - write_lock_bh(&__ip_vs_sched_lock); - if (list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); - IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " - "is not in the list. failed\n", scheduler->name); - return -EINVAL; - } - - /* - * Remove it from the d-linked scheduler list - */ - list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c deleted file mode 100644 index 77663d84cbd1..000000000000 --- a/net/ipv4/ipvs/ip_vs_sed.c +++ /dev/null @@ -1,161 +0,0 @@ -/* - * IPVS: Shortest Expected Delay scheduling module - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The SED algorithm attempts to minimize each job's expected delay until - * completion. The expected delay that the job will experience is - * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of - * jobs on the ith server and Ui is the fixed service rate (weight) of - * the ith server. The SED algorithm adopts a greedy policy that each does - * what is in its own best interest, i.e. to join the queue which would - * minimize its expected delay of completion. - * - * See the following paper for more information: - * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing - * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, - * pages 986-994, 1988. - * - * Thanks must go to Marko Buuri <marko@buuri.name> for talking SED to me. - * - * The difference between SED and WLC is that SED includes the incoming - * job in the cost function (the increment of 1). SED may outperform - * WLC, while scheduling big jobs under larger heterogeneous systems - * (the server weight varies a lot). - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int -ip_vs_sed_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_sed_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_sed_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline unsigned int -ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We only use the active connection number in the cost - * calculation here. - */ - return atomic_read(&dest->activeconns) + 1; -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (server expected overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_sed_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_sed_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_sed_scheduler = -{ - .name = "sed", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_sed_scheduler.n_list), - .init_service = ip_vs_sed_init_svc, - .done_service = ip_vs_sed_done_svc, - .update_service = ip_vs_sed_update_svc, - .schedule = ip_vs_sed_schedule, -}; - - -static int __init ip_vs_sed_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -static void __exit ip_vs_sed_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); -} - -module_init(ip_vs_sed_init); -module_exit(ip_vs_sed_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c deleted file mode 100644 index 7b979e228056..000000000000 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ /dev/null @@ -1,255 +0,0 @@ -/* - * IPVS: Source Hashing scheduling module - * - * Authors: Wensong Zhang <wensong@gnuchina.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -/* - * The sh algorithm is to select server by the hash key of source IP - * address. The pseudo code is as follows: - * - * n <- servernode[src_ip]; - * if (n is dead) OR - * (n is overloaded) or (n.weight <= 0) then - * return NULL; - * - * return n; - * - * Notes that servernode is a 256-bucket hash table that maps the hash - * index derived from packet source IP address to the current server - * array. If the sh scheduler is used in cache cluster, it is good to - * combine it with cache_bypass feature. When the statically assigned - * server is dead or overloaded, the load balancer can bypass the cache - * server and send requests to the original server directly. - * - */ - -#include <linux/ip.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/skbuff.h> - -#include <net/ip_vs.h> - - -/* - * IPVS SH bucket - */ -struct ip_vs_sh_bucket { - struct ip_vs_dest *dest; /* real server (cache) */ -}; - -/* - * for IPVS SH entry hash table - */ -#ifndef CONFIG_IP_VS_SH_TAB_BITS -#define CONFIG_IP_VS_SH_TAB_BITS 8 -#endif -#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS -#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) -#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) - - -/* - * Returns hash value for IPVS SH entry - */ -static inline unsigned ip_vs_sh_hashkey(__be32 addr) -{ - return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; -} - - -/* - * Get ip_vs_dest associated with supplied parameters. - */ -static inline struct ip_vs_dest * -ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __be32 addr) -{ - return (tbl[ip_vs_sh_hashkey(addr)]).dest; -} - - -/* - * Assign all the hash buckets of the specified table with the service. - */ -static int -ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) -{ - int i; - struct ip_vs_sh_bucket *b; - struct list_head *p; - struct ip_vs_dest *dest; - - b = tbl; - p = &svc->destinations; - for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (list_empty(p)) { - b->dest = NULL; - } else { - if (p == &svc->destinations) - p = p->next; - - dest = list_entry(p, struct ip_vs_dest, n_list); - atomic_inc(&dest->refcnt); - b->dest = dest; - - p = p->next; - } - b++; - } - return 0; -} - - -/* - * Flush all the hash buckets of the specified table. - */ -static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) -{ - int i; - struct ip_vs_sh_bucket *b; - - b = tbl; - for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { - if (b->dest) { - atomic_dec(&b->dest->refcnt); - b->dest = NULL; - } - b++; - } -} - - -static int ip_vs_sh_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl; - - /* allocate the SH table for this service */ - tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, - GFP_ATOMIC); - if (tbl == NULL) { - IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); - return -ENOMEM; - } - svc->sched_data = tbl; - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " - "current service\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -static int ip_vs_sh_done_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* release the table itself */ - kfree(svc->sched_data); - IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", - sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); - - return 0; -} - - -static int ip_vs_sh_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_sh_bucket *tbl = svc->sched_data; - - /* got to clean up hash buckets here */ - ip_vs_sh_flush(tbl); - - /* assign the hash buckets with the updated service */ - ip_vs_sh_assign(tbl, svc); - - return 0; -} - - -/* - * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, - * consider that the server is overloaded here. - */ -static inline int is_overloaded(struct ip_vs_dest *dest) -{ - return dest->flags & IP_VS_DEST_F_OVERLOAD; -} - - -/* - * Source Hashing scheduling - */ -static struct ip_vs_dest * -ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_sh_bucket *tbl; - struct iphdr *iph = ip_hdr(skb); - - IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); - - tbl = (struct ip_vs_sh_bucket *)svc->sched_data; - dest = ip_vs_sh_get(tbl, iph->saddr); - if (!dest - || !(dest->flags & IP_VS_DEST_F_AVAILABLE) - || atomic_read(&dest->weight) <= 0 - || is_overloaded(dest)) { - return NULL; - } - - IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " - "--> server %u.%u.%u.%u:%d\n", - NIPQUAD(iph->saddr), - NIPQUAD(dest->addr), - ntohs(dest->port)); - - return dest; -} - - -/* - * IPVS SH Scheduler structure - */ -static struct ip_vs_scheduler ip_vs_sh_scheduler = -{ - .name = "sh", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_sh_scheduler.n_list), - .init_service = ip_vs_sh_init_svc, - .done_service = ip_vs_sh_done_svc, - .update_service = ip_vs_sh_update_svc, - .schedule = ip_vs_sh_schedule, -}; - - -static int __init ip_vs_sh_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -static void __exit ip_vs_sh_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); -} - - -module_init(ip_vs_sh_init); -module_exit(ip_vs_sh_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c deleted file mode 100644 index a652da2c3200..000000000000 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ /dev/null @@ -1,930 +0,0 @@ -/* - * IPVS An implementation of the IP virtual server support for the - * LINUX operating system. IPVS is now implemented as a module - * over the NetFilter framework. IPVS can be used to build a - * high-performance and highly available server based on a - * cluster of servers. - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * ip_vs_sync: sync connection info from master load balancer to backups - * through multicast - * - * Changes: - * Alexandre Cassen : Added master & backup support at a time. - * Alexandre Cassen : Added SyncID support for incoming sync - * messages filtering. - * Justin Ossevoort : Fix endian problem on sync message size. - */ - -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/inetdevice.h> -#include <linux/net.h> -#include <linux/completion.h> -#include <linux/delay.h> -#include <linux/skbuff.h> -#include <linux/in.h> -#include <linux/igmp.h> /* for ip_mc_join_group */ -#include <linux/udp.h> -#include <linux/err.h> -#include <linux/kthread.h> -#include <linux/wait.h> - -#include <net/ip.h> -#include <net/sock.h> - -#include <net/ip_vs.h> - -#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ -#define IP_VS_SYNC_PORT 8848 /* multicast port */ - - -/* - * IPVS sync connection entry - */ -struct ip_vs_sync_conn { - __u8 reserved; - - /* Protocol, addresses and port numbers */ - __u8 protocol; /* Which protocol (TCP/UDP) */ - __be16 cport; - __be16 vport; - __be16 dport; - __be32 caddr; /* client address */ - __be32 vaddr; /* virtual address */ - __be32 daddr; /* destination address */ - - /* Flags and state transition */ - __be16 flags; /* status flags */ - __be16 state; /* state info */ - - /* The sequence options start here */ -}; - -struct ip_vs_sync_conn_options { - struct ip_vs_seq in_seq; /* incoming seq. struct */ - struct ip_vs_seq out_seq; /* outgoing seq. struct */ -}; - -struct ip_vs_sync_thread_data { - struct socket *sock; - char *buf; -}; - -#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) -#define FULL_CONN_SIZE \ -(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) - - -/* - The master mulitcasts messages to the backup load balancers in the - following format. - - 0 1 2 3 - 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | Count Conns | SyncID | Size | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (1) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | . | - | . | - | . | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ - | | - | IPVS Sync Connection (n) | - +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -*/ - -#define SYNC_MESG_HEADER_LEN 4 - -struct ip_vs_sync_mesg { - __u8 nr_conns; - __u8 syncid; - __u16 size; - - /* ip_vs_sync_conn entries start here */ -}; - -/* the maximum length of sync (sending/receiving) message */ -static int sync_send_mesg_maxlen; -static int sync_recv_mesg_maxlen; - -struct ip_vs_sync_buff { - struct list_head list; - unsigned long firstuse; - - /* pointers for the message data */ - struct ip_vs_sync_mesg *mesg; - unsigned char *head; - unsigned char *end; -}; - - -/* the sync_buff list head and the lock */ -static LIST_HEAD(ip_vs_sync_queue); -static DEFINE_SPINLOCK(ip_vs_sync_lock); - -/* current sync_buff for accepting new conn entries */ -static struct ip_vs_sync_buff *curr_sb = NULL; -static DEFINE_SPINLOCK(curr_sb_lock); - -/* ipvs sync daemon state */ -volatile int ip_vs_sync_state = IP_VS_STATE_NONE; -volatile int ip_vs_master_syncid = 0; -volatile int ip_vs_backup_syncid = 0; - -/* multicast interface name */ -char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; -char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; - -/* sync daemon tasks */ -static struct task_struct *sync_master_thread; -static struct task_struct *sync_backup_thread; - -/* multicast addr */ -static struct sockaddr_in mcast_addr = { - .sin_family = AF_INET, - .sin_port = __constant_htons(IP_VS_SYNC_PORT), - .sin_addr.s_addr = __constant_htonl(IP_VS_SYNC_GROUP), -}; - - -static inline struct ip_vs_sync_buff *sb_dequeue(void) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&ip_vs_sync_lock); - if (list_empty(&ip_vs_sync_queue)) { - sb = NULL; - } else { - sb = list_entry(ip_vs_sync_queue.next, - struct ip_vs_sync_buff, - list); - list_del(&sb->list); - } - spin_unlock_bh(&ip_vs_sync_lock); - - return sb; -} - -static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) -{ - struct ip_vs_sync_buff *sb; - - if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) - return NULL; - - if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { - kfree(sb); - return NULL; - } - sb->mesg->nr_conns = 0; - sb->mesg->syncid = ip_vs_master_syncid; - sb->mesg->size = 4; - sb->head = (unsigned char *)sb->mesg + 4; - sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; - sb->firstuse = jiffies; - return sb; -} - -static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) -{ - kfree(sb->mesg); - kfree(sb); -} - -static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) -{ - spin_lock(&ip_vs_sync_lock); - if (ip_vs_sync_state & IP_VS_STATE_MASTER) - list_add_tail(&sb->list, &ip_vs_sync_queue); - else - ip_vs_sync_buff_release(sb); - spin_unlock(&ip_vs_sync_lock); -} - -/* - * Get the current sync buffer if it has been created for more - * than the specified time or the specified time is zero. - */ -static inline struct ip_vs_sync_buff * -get_curr_sync_buff(unsigned long time) -{ - struct ip_vs_sync_buff *sb; - - spin_lock_bh(&curr_sb_lock); - if (curr_sb && (time == 0 || - time_before(jiffies - curr_sb->firstuse, time))) { - sb = curr_sb; - curr_sb = NULL; - } else - sb = NULL; - spin_unlock_bh(&curr_sb_lock); - return sb; -} - - -/* - * Add an ip_vs_conn information into the current sync_buff. - * Called by ip_vs_in. - */ -void ip_vs_sync_conn(struct ip_vs_conn *cp) -{ - struct ip_vs_sync_mesg *m; - struct ip_vs_sync_conn *s; - int len; - - spin_lock(&curr_sb_lock); - if (!curr_sb) { - if (!(curr_sb=ip_vs_sync_buff_create())) { - spin_unlock(&curr_sb_lock); - IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); - return; - } - } - - len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : - SIMPLE_CONN_SIZE; - m = curr_sb->mesg; - s = (struct ip_vs_sync_conn *)curr_sb->head; - - /* copy members */ - s->protocol = cp->protocol; - s->cport = cp->cport; - s->vport = cp->vport; - s->dport = cp->dport; - s->caddr = cp->caddr; - s->vaddr = cp->vaddr; - s->daddr = cp->daddr; - s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); - s->state = htons(cp->state); - if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { - struct ip_vs_sync_conn_options *opt = - (struct ip_vs_sync_conn_options *)&s[1]; - memcpy(opt, &cp->in_seq, sizeof(*opt)); - } - - m->nr_conns++; - m->size += len; - curr_sb->head += len; - - /* check if there is a space for next one */ - if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { - sb_queue_tail(curr_sb); - curr_sb = NULL; - } - spin_unlock(&curr_sb_lock); - - /* synchronize its controller if it has */ - if (cp->control) - ip_vs_sync_conn(cp->control); -} - - -/* - * Process received multicast message and create the corresponding - * ip_vs_conn entries. - */ -static void ip_vs_process_message(const char *buffer, const size_t buflen) -{ - struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; - struct ip_vs_sync_conn *s; - struct ip_vs_sync_conn_options *opt; - struct ip_vs_conn *cp; - struct ip_vs_protocol *pp; - struct ip_vs_dest *dest; - char *p; - int i; - - if (buflen < sizeof(struct ip_vs_sync_mesg)) { - IP_VS_ERR_RL("sync message header too short\n"); - return; - } - - /* Convert size back to host byte order */ - m->size = ntohs(m->size); - - if (buflen != m->size) { - IP_VS_ERR_RL("bogus sync message size\n"); - return; - } - - /* SyncID sanity check */ - if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { - IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", - m->syncid); - return; - } - - p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); - for (i=0; i<m->nr_conns; i++) { - unsigned flags, state; - - if (p + SIMPLE_CONN_SIZE > buffer+buflen) { - IP_VS_ERR_RL("bogus conn in sync message\n"); - return; - } - s = (struct ip_vs_sync_conn *) p; - flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; - flags &= ~IP_VS_CONN_F_HASHED; - if (flags & IP_VS_CONN_F_SEQ_MASK) { - opt = (struct ip_vs_sync_conn_options *)&s[1]; - p += FULL_CONN_SIZE; - if (p > buffer+buflen) { - IP_VS_ERR_RL("bogus conn options in sync message\n"); - return; - } - } else { - opt = NULL; - p += SIMPLE_CONN_SIZE; - } - - state = ntohs(s->state); - if (!(flags & IP_VS_CONN_F_TEMPLATE)) { - pp = ip_vs_proto_get(s->protocol); - if (!pp) { - IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", - s->protocol); - continue; - } - if (state >= pp->num_states) { - IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", - pp->name, state); - continue; - } - } else { - /* protocol in templates is not used for state/timeout */ - pp = NULL; - if (state > 0) { - IP_VS_DBG(2, "Invalid template state %u in sync msg\n", - state); - state = 0; - } - } - - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport); - else - cp = ip_vs_ct_in_get(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport); - if (!cp) { - /* - * Find the appropriate destination for the connection. - * If it is not found the connection will remain unbound - * but still handled. - */ - dest = ip_vs_find_dest(s->daddr, s->dport, - s->vaddr, s->vport, - s->protocol); - /* Set the approprite ativity flag */ - if (s->protocol == IPPROTO_TCP) { - if (state != IP_VS_TCP_S_ESTABLISHED) - flags |= IP_VS_CONN_F_INACTIVE; - else - flags &= ~IP_VS_CONN_F_INACTIVE; - } - cp = ip_vs_conn_new(s->protocol, - s->caddr, s->cport, - s->vaddr, s->vport, - s->daddr, s->dport, - flags, dest); - if (dest) - atomic_dec(&dest->refcnt); - if (!cp) { - IP_VS_ERR("ip_vs_conn_new failed\n"); - return; - } - } else if (!cp->dest) { - dest = ip_vs_try_bind_dest(cp); - if (dest) - atomic_dec(&dest->refcnt); - } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) && - (cp->state != state)) { - /* update active/inactive flag for the connection */ - dest = cp->dest; - if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && - (state != IP_VS_TCP_S_ESTABLISHED)) { - atomic_dec(&dest->activeconns); - atomic_inc(&dest->inactconns); - cp->flags |= IP_VS_CONN_F_INACTIVE; - } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && - (state == IP_VS_TCP_S_ESTABLISHED)) { - atomic_inc(&dest->activeconns); - atomic_dec(&dest->inactconns); - cp->flags &= ~IP_VS_CONN_F_INACTIVE; - } - } - - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); - atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); - cp->state = state; - cp->old_state = cp->state; - /* - * We can not recover the right timeout for templates - * in all cases, we can not find the right fwmark - * virtual service. If needed, we can do it for - * non-fwmark persistent services. - */ - if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table) - cp->timeout = pp->timeout_table[state]; - else - cp->timeout = (3*60*HZ); - ip_vs_conn_put(cp); - } -} - - -/* - * Setup loopback of outgoing multicasts on a sending socket - */ -static void set_mcast_loop(struct sock *sk, u_char loop) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ - lock_sock(sk); - inet->mc_loop = loop ? 1 : 0; - release_sock(sk); -} - -/* - * Specify TTL for outgoing multicasts on a sending socket - */ -static void set_mcast_ttl(struct sock *sk, u_char ttl) -{ - struct inet_sock *inet = inet_sk(sk); - - /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ - lock_sock(sk); - inet->mc_ttl = ttl; - release_sock(sk); -} - -/* - * Specifiy default interface for outgoing multicasts - */ -static int set_mcast_if(struct sock *sk, char *ifname) -{ - struct net_device *dev; - struct inet_sock *inet = inet_sk(sk); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - lock_sock(sk); - inet->mc_index = dev->ifindex; - /* inet->mc_addr = 0; */ - release_sock(sk); - - return 0; -} - - -/* - * Set the maximum length of sync message according to the - * specified interface's MTU. - */ -static int set_sync_mesg_maxlen(int sync_state) -{ - struct net_device *dev; - int num; - - if (sync_state == IP_VS_STATE_MASTER) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) - return -ENODEV; - - num = (dev->mtu - sizeof(struct iphdr) - - sizeof(struct udphdr) - - SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; - sync_send_mesg_maxlen = - SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; - IP_VS_DBG(7, "setting the maximum length of sync sending " - "message %d.\n", sync_send_mesg_maxlen); - } else if (sync_state == IP_VS_STATE_BACKUP) { - if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) - return -ENODEV; - - sync_recv_mesg_maxlen = dev->mtu - - sizeof(struct iphdr) - sizeof(struct udphdr); - IP_VS_DBG(7, "setting the maximum length of sync receiving " - "message %d.\n", sync_recv_mesg_maxlen); - } - - return 0; -} - - -/* - * Join a multicast group. - * the group is specified by a class D multicast address 224.0.0.0/8 - * in the in_addr structure passed in as a parameter. - */ -static int -join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) -{ - struct ip_mreqn mreq; - struct net_device *dev; - int ret; - - memset(&mreq, 0, sizeof(mreq)); - memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) - return -EINVAL; - - mreq.imr_ifindex = dev->ifindex; - - lock_sock(sk); - ret = ip_mc_join_group(sk, &mreq); - release_sock(sk); - - return ret; -} - - -static int bind_mcastif_addr(struct socket *sock, char *ifname) -{ - struct net_device *dev; - __be32 addr; - struct sockaddr_in sin; - - if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) - return -ENODEV; - - addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); - if (!addr) - IP_VS_ERR("You probably need to specify IP address on " - "multicast interface.\n"); - - IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", - ifname, NIPQUAD(addr)); - - /* Now bind the socket with the address of multicast interface */ - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = addr; - sin.sin_port = 0; - - return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); -} - -/* - * Set up sending multicast socket over UDP - */ -static struct socket * make_send_sock(void) -{ - struct socket *sock; - int result; - - /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return ERR_PTR(result); - } - - result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error setting outbound mcast interface\n"); - goto error; - } - - set_mcast_loop(sock->sk, 0); - set_mcast_ttl(sock->sk, 1); - - result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error binding address of the mcast interface\n"); - goto error; - } - - result = sock->ops->connect(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr), 0); - if (result < 0) { - IP_VS_ERR("Error connecting to the multicast addr\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return ERR_PTR(result); -} - - -/* - * Set up receiving multicast socket over UDP - */ -static struct socket * make_receive_sock(void) -{ - struct socket *sock; - int result; - - /* First create a socket */ - result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); - if (result < 0) { - IP_VS_ERR("Error during creation of socket; terminating\n"); - return ERR_PTR(result); - } - - /* it is equivalent to the REUSEADDR option in user-space */ - sock->sk->sk_reuse = 1; - - result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr, - sizeof(struct sockaddr)); - if (result < 0) { - IP_VS_ERR("Error binding to the multicast addr\n"); - goto error; - } - - /* join the multicast group */ - result = join_mcast_group(sock->sk, - (struct in_addr *) &mcast_addr.sin_addr, - ip_vs_backup_mcast_ifn); - if (result < 0) { - IP_VS_ERR("Error joining to the multicast group\n"); - goto error; - } - - return sock; - - error: - sock_release(sock); - return ERR_PTR(result); -} - - -static int -ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) -{ - struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; - struct kvec iov; - int len; - - EnterFunction(7); - iov.iov_base = (void *)buffer; - iov.iov_len = length; - - len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); - - LeaveFunction(7); - return len; -} - -static void -ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) -{ - int msize; - - msize = msg->size; - - /* Put size in network byte order */ - msg->size = htons(msg->size); - - if (ip_vs_send_async(sock, (char *)msg, msize) != msize) - IP_VS_ERR("ip_vs_send_async error\n"); -} - -static int -ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) -{ - struct msghdr msg = {NULL,}; - struct kvec iov; - int len; - - EnterFunction(7); - - /* Receive a packet */ - iov.iov_base = buffer; - iov.iov_len = (size_t)buflen; - - len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); - - if (len < 0) - return -1; - - LeaveFunction(7); - return len; -} - - -static int sync_thread_master(void *data) -{ - struct ip_vs_sync_thread_data *tinfo = data; - struct ip_vs_sync_buff *sb; - - IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_master_mcast_ifn, ip_vs_master_syncid); - - while (!kthread_should_stop()) { - while ((sb = sb_dequeue())) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - /* check if entries stay in curr_sb for 2 seconds */ - sb = get_curr_sync_buff(2 * HZ); - if (sb) { - ip_vs_send_sync_msg(tinfo->sock, sb->mesg); - ip_vs_sync_buff_release(sb); - } - - schedule_timeout_interruptible(HZ); - } - - /* clean up the sync_buff queue */ - while ((sb=sb_dequeue())) { - ip_vs_sync_buff_release(sb); - } - - /* clean up the current sync_buff */ - if ((sb = get_curr_sync_buff(0))) { - ip_vs_sync_buff_release(sb); - } - - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo); - - return 0; -} - - -static int sync_thread_backup(void *data) -{ - struct ip_vs_sync_thread_data *tinfo = data; - int len; - - IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " - "syncid = %d\n", - ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); - - while (!kthread_should_stop()) { - wait_event_interruptible(*tinfo->sock->sk->sk_sleep, - !skb_queue_empty(&tinfo->sock->sk->sk_receive_queue) - || kthread_should_stop()); - - /* do we have data now? */ - while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { - len = ip_vs_receive(tinfo->sock, tinfo->buf, - sync_recv_mesg_maxlen); - if (len <= 0) { - IP_VS_ERR("receiving message error\n"); - break; - } - - /* disable bottom half, because it accesses the data - shared by softirq while getting/creating conns */ - local_bh_disable(); - ip_vs_process_message(tinfo->buf, len); - local_bh_enable(); - } - } - - /* release the sending multicast socket */ - sock_release(tinfo->sock); - kfree(tinfo->buf); - kfree(tinfo); - - return 0; -} - - -int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) -{ - struct ip_vs_sync_thread_data *tinfo; - struct task_struct **realtask, *task; - struct socket *sock; - char *name, *buf = NULL; - int (*threadfn)(void *data); - int result = -ENOMEM; - - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", - sizeof(struct ip_vs_sync_conn)); - - if (state == IP_VS_STATE_MASTER) { - if (sync_master_thread) - return -EEXIST; - - strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, - sizeof(ip_vs_master_mcast_ifn)); - ip_vs_master_syncid = syncid; - realtask = &sync_master_thread; - name = "ipvs_syncmaster"; - threadfn = sync_thread_master; - sock = make_send_sock(); - } else if (state == IP_VS_STATE_BACKUP) { - if (sync_backup_thread) - return -EEXIST; - - strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, - sizeof(ip_vs_backup_mcast_ifn)); - ip_vs_backup_syncid = syncid; - realtask = &sync_backup_thread; - name = "ipvs_syncbackup"; - threadfn = sync_thread_backup; - sock = make_receive_sock(); - } else { - return -EINVAL; - } - - if (IS_ERR(sock)) { - result = PTR_ERR(sock); - goto out; - } - - set_sync_mesg_maxlen(state); - if (state == IP_VS_STATE_BACKUP) { - buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); - if (!buf) - goto outsocket; - } - - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); - if (!tinfo) - goto outbuf; - - tinfo->sock = sock; - tinfo->buf = buf; - - task = kthread_run(threadfn, tinfo, name); - if (IS_ERR(task)) { - result = PTR_ERR(task); - goto outtinfo; - } - - /* mark as active */ - *realtask = task; - ip_vs_sync_state |= state; - - /* increase the module use count */ - ip_vs_use_count_inc(); - - return 0; - -outtinfo: - kfree(tinfo); -outbuf: - kfree(buf); -outsocket: - sock_release(sock); -out: - return result; -} - - -int stop_sync_thread(int state) -{ - IP_VS_DBG(7, "%s: pid %d\n", __func__, task_pid_nr(current)); - - if (state == IP_VS_STATE_MASTER) { - if (!sync_master_thread) - return -ESRCH; - - IP_VS_INFO("stopping master sync thread %d ...\n", - task_pid_nr(sync_master_thread)); - - /* - * The lock synchronizes with sb_queue_tail(), so that we don't - * add sync buffers to the queue, when we are already in - * progress of stopping the master sync daemon. - */ - - spin_lock_bh(&ip_vs_sync_lock); - ip_vs_sync_state &= ~IP_VS_STATE_MASTER; - spin_unlock_bh(&ip_vs_sync_lock); - kthread_stop(sync_master_thread); - sync_master_thread = NULL; - } else if (state == IP_VS_STATE_BACKUP) { - if (!sync_backup_thread) - return -ESRCH; - - IP_VS_INFO("stopping backup sync thread %d ...\n", - task_pid_nr(sync_backup_thread)); - - ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(sync_backup_thread); - sync_backup_thread = NULL; - } else { - return -EINVAL; - } - - /* decrease the module use count */ - ip_vs_use_count_dec(); - - return 0; -} diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c deleted file mode 100644 index 9b0ef86bb1f7..000000000000 --- a/net/ipv4/ipvs/ip_vs_wlc.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * IPVS: Weighted Least-Connection Scheduling module - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Peter Kese <peter.kese@ijs.si> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest - * Wensong Zhang : changed to use the inactconns in scheduling - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wlc_update_svc - * Wensong Zhang : added any dest with weight=0 is quiesced - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> - -#include <net/ip_vs.h> - - -static int -ip_vs_wlc_init_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_wlc_done_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static int -ip_vs_wlc_update_svc(struct ip_vs_service *svc) -{ - return 0; -} - - -static inline unsigned int -ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. (This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - -/* - * Weighted Least Connection scheduling - */ -static struct ip_vs_dest * -ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest, *least; - unsigned int loh, doh; - - IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); - - /* - * We calculate the load of each dest server as follows: - * (dest overhead) / dest->weight - * - * Remember -- no floats in kernel mode!!! - * The comparison of h1*w2 > h2*w1 is equivalent to that of - * h1/w1 > h2/w2 - * if every weight is larger than zero. - * - * The server with weight=0 is quiesced and will not receive any - * new connections. - */ - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) > 0) { - least = dest; - loh = ip_vs_wlc_dest_overhead(least); - goto nextstage; - } - } - return NULL; - - /* - * Find the destination with the least load. - */ - nextstage: - list_for_each_entry_continue(dest, &svc->destinations, n_list) { - if (dest->flags & IP_VS_DEST_F_OVERLOAD) - continue; - doh = ip_vs_wlc_dest_overhead(dest); - if (loh * atomic_read(&dest->weight) > - doh * atomic_read(&least->weight)) { - least = dest; - loh = doh; - } - } - - IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d overhead %d\n", - NIPQUAD(least->addr), ntohs(least->port), - atomic_read(&least->activeconns), - atomic_read(&least->refcnt), - atomic_read(&least->weight), loh); - - return least; -} - - -static struct ip_vs_scheduler ip_vs_wlc_scheduler = -{ - .name = "wlc", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_wlc_scheduler.n_list), - .init_service = ip_vs_wlc_init_svc, - .done_service = ip_vs_wlc_done_svc, - .update_service = ip_vs_wlc_update_svc, - .schedule = ip_vs_wlc_schedule, -}; - - -static int __init ip_vs_wlc_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -static void __exit ip_vs_wlc_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); -} - -module_init(ip_vs_wlc_init); -module_exit(ip_vs_wlc_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c deleted file mode 100644 index 0d86a79b87b5..000000000000 --- a/net/ipv4/ipvs/ip_vs_wrr.c +++ /dev/null @@ -1,234 +0,0 @@ -/* - * IPVS: Weighted Round-Robin Scheduling module - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest - * Wensong Zhang : changed some comestics things for debugging - * Wensong Zhang : changed for the d-linked destination list - * Wensong Zhang : added the ip_vs_wrr_update_svc - * Julian Anastasov : fixed the bug of returning destination - * with weight 0 when all weights are zero - * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/net.h> - -#include <net/ip_vs.h> - -/* - * current destination pointer for weighted round-robin scheduling - */ -struct ip_vs_wrr_mark { - struct list_head *cl; /* current list head */ - int cw; /* current weight */ - int mw; /* maximum weight */ - int di; /* decreasing interval */ -}; - - -/* - * Get the gcd of server weights - */ -static int gcd(int a, int b) -{ - int c; - - while ((c = a % b)) { - a = b; - b = c; - } - return b; -} - -static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight; - int g = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - weight = atomic_read(&dest->weight); - if (weight > 0) { - if (g > 0) - g = gcd(weight, g); - else - g = weight; - } - } - return g ? g : 1; -} - - -/* - * Get the maximum weight of the service destinations. - */ -static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) -{ - struct ip_vs_dest *dest; - int weight = 0; - - list_for_each_entry(dest, &svc->destinations, n_list) { - if (atomic_read(&dest->weight) > weight) - weight = atomic_read(&dest->weight); - } - - return weight; -} - - -static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark; - - /* - * Allocate the mark variable for WRR scheduling - */ - mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); - if (mark == NULL) { - IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); - return -ENOMEM; - } - mark->cl = &svc->destinations; - mark->cw = 0; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - svc->sched_data = mark; - - return 0; -} - - -static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) -{ - /* - * Release the mark variable - */ - kfree(svc->sched_data); - - return 0; -} - - -static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) -{ - struct ip_vs_wrr_mark *mark = svc->sched_data; - - mark->cl = &svc->destinations; - mark->mw = ip_vs_wrr_max_weight(svc); - mark->di = ip_vs_wrr_gcd_weight(svc); - if (mark->cw > mark->mw) - mark->cw = 0; - return 0; -} - - -/* - * Weighted Round-Robin Scheduling - */ -static struct ip_vs_dest * -ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) -{ - struct ip_vs_dest *dest; - struct ip_vs_wrr_mark *mark = svc->sched_data; - struct list_head *p; - - IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); - - /* - * This loop will always terminate, because mark->cw in (0, max_weight] - * and at least one server has its weight equal to max_weight. - */ - write_lock(&svc->sched_lock); - p = mark->cl; - while (1) { - if (mark->cl == &svc->destinations) { - /* it is at the head of the destination list */ - - if (mark->cl == mark->cl->next) { - /* no dest entry */ - dest = NULL; - goto out; - } - - mark->cl = svc->destinations.next; - mark->cw -= mark->di; - if (mark->cw <= 0) { - mark->cw = mark->mw; - /* - * Still zero, which means no available servers. - */ - if (mark->cw == 0) { - mark->cl = &svc->destinations; - IP_VS_ERR_RL("ip_vs_wrr_schedule(): " - "no available servers\n"); - dest = NULL; - goto out; - } - } - } else - mark->cl = mark->cl->next; - - if (mark->cl != &svc->destinations) { - /* not at the head of the list */ - dest = list_entry(mark->cl, struct ip_vs_dest, n_list); - if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && - atomic_read(&dest->weight) >= mark->cw) { - /* got it */ - break; - } - } - - if (mark->cl == p && mark->cw == mark->di) { - /* back to the start, and no dest is found. - It is only possible when all dests are OVERLOADED */ - dest = NULL; - goto out; - } - } - - IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " - "activeconns %d refcnt %d weight %d\n", - NIPQUAD(dest->addr), ntohs(dest->port), - atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), - atomic_read(&dest->weight)); - - out: - write_unlock(&svc->sched_lock); - return dest; -} - - -static struct ip_vs_scheduler ip_vs_wrr_scheduler = { - .name = "wrr", - .refcnt = ATOMIC_INIT(0), - .module = THIS_MODULE, - .n_list = LIST_HEAD_INIT(ip_vs_wrr_scheduler.n_list), - .init_service = ip_vs_wrr_init_svc, - .done_service = ip_vs_wrr_done_svc, - .update_service = ip_vs_wrr_update_svc, - .schedule = ip_vs_wrr_schedule, -}; - -static int __init ip_vs_wrr_init(void) -{ - return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; -} - -static void __exit ip_vs_wrr_cleanup(void) -{ - unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); -} - -module_init(ip_vs_wrr_init); -module_exit(ip_vs_wrr_cleanup); -MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c deleted file mode 100644 index 9892d4aca42e..000000000000 --- a/net/ipv4/ipvs/ip_vs_xmit.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * ip_vs_xmit.c: various packet transmitters for IPVS - * - * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> - * Julian Anastasov <ja@ssi.bg> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * Changes: - * - */ - -#include <linux/kernel.h> -#include <linux/tcp.h> /* for tcphdr */ -#include <net/ip.h> -#include <net/tcp.h> /* for csum_tcpudp_magic */ -#include <net/udp.h> -#include <net/icmp.h> /* for icmp_send */ -#include <net/route.h> /* for ip_route_output */ -#include <linux/netfilter.h> -#include <linux/netfilter_ipv4.h> - -#include <net/ip_vs.h> - - -/* - * Destination cache to speed up outgoing route lookup - */ -static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = dst; - dest->dst_rtos = rtos; - dst_release(old_dst); -} - -static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) -{ - struct dst_entry *dst = dest->dst_cache; - - if (!dst) - return NULL; - if ((dst->obsolete || rtos != dest->dst_rtos) && - dst->ops->check(dst, cookie) == NULL) { - dest->dst_cache = NULL; - dst_release(dst); - return NULL; - } - dst_hold(dst); - return dst; -} - -static struct rtable * -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) -{ - struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; - - if (dest) { - spin_lock(&dest->dst_lock); - if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos, 0))) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = dest->addr, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip_route_output error, " - "dest: %u.%u.%u.%u\n", - NIPQUAD(dest->addr)); - return NULL; - } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); - IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", - NIPQUAD(dest->addr), - atomic_read(&rt->u.dst.__refcnt), rtos); - } - spin_unlock(&dest->dst_lock); - } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = cp->daddr, - .saddr = 0, - .tos = rtos, } }, - }; - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_route_output error, dest: " - "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); - return NULL; - } - } - - return rt; -} - - -/* - * Release dest->dst_cache before a dest is removed - */ -void -ip_vs_dst_reset(struct ip_vs_dest *dest) -{ - struct dst_entry *old_dst; - - old_dst = dest->dst_cache; - dest->dst_cache = NULL; - dst_release(old_dst); -} - -#define IP_VS_XMIT(skb, rt) \ -do { \ - (skb)->ipvs_property = 1; \ - skb_forward_csum(skb); \ - NF_HOOK(PF_INET, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->u.dst.dev, dst_output); \ -} while (0) - - -/* - * NULL transmitter (do nothing except return NF_ACCEPT) - */ -int -ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; -} - - -/* - * Bypass transmitter - * Let packets bypass the destination when the destination is not - * available, it may be only used in transparent cache cluster. - */ -int -ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; - int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; - - EnterFunction(10); - - if (ip_route_output_key(&init_net, &rt, &fl)) { - IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " - "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); - goto tx_error_icmp; - } - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - - -/* - * NAT transmitter (only for outside-to-inside nat forwarding) - * Not used for related ICMP - */ -int -ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - struct iphdr *iph = ip_hdr(skb); - - EnterFunction(10); - - /* check if it is a connection of no-client-port */ - if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { - __be16 _pt, *p; - p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); - if (p == NULL) - goto tx_error; - ip_vs_conn_fill_cport(cp, *p); - IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, sizeof(struct iphdr))) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* mangle the packet */ - if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; - ip_hdr(skb)->daddr = cp->daddr; - ip_send_check(ip_hdr(skb)); - - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - - /* FIXME: when application helper enlarges the packet and the length - is larger than the MTU of outgoing device, there will be still - MTU problem. */ - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - LeaveFunction(10); - kfree_skb(skb); - return NF_STOLEN; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} - - -/* - * IP Tunneling transmitter - * - * This function encapsulates the packet in a new IP packet, its - * destination will be set to cp->daddr. Most code of this function - * is taken from ipip.c. - * - * It is used in VS/TUN cluster. The load balancer selects a real - * server from a cluster based on a scheduling algorithm, - * encapsulates the request packet and forwards it to the selected - * server. For example, all real servers are configured with - * "ifconfig tunl0 <Virtual IP Address> up". When the server receives - * the encapsulated packet, it will decapsulate the packet, processe - * the request and return the response packets directly to the client - * without passing the load balancer. This can greatly increase the - * scalability of virtual server. - * - * Used for ANY protocol - */ -int -ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ - struct iphdr *old_iph = ip_hdr(skb); - u8 tos = old_iph->tos; - __be16 df = old_iph->frag_off; - sk_buff_data_t old_transport_header = skb->transport_header; - struct iphdr *iph; /* Our new IP header */ - unsigned int max_headroom; /* The extra header space needed */ - int mtu; - - EnterFunction(10); - - if (skb->protocol != htons(ETH_P_IP)) { - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " - "ETH_P_IP: %d, skb protocol: %d\n", - htons(ETH_P_IP), skb->protocol); - goto tx_error; - } - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) - goto tx_error_icmp; - - tdev = rt->u.dst.dev; - - mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - if (mtu < 68) { - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); - goto tx_error; - } - if (skb->dst) - skb->dst->ops->update_pmtu(skb->dst, mtu); - - df |= (old_iph->frag_off & htons(IP_DF)); - - if ((old_iph->frag_off & htons(IP_DF)) - && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Okay, now see if we can stuff it in the buffer as-is. - */ - max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); - - if (skb_headroom(skb) < max_headroom - || skb_cloned(skb) || skb_shared(skb)) { - struct sk_buff *new_skb = - skb_realloc_headroom(skb, max_headroom); - if (!new_skb) { - ip_rt_put(rt); - kfree_skb(skb); - IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); - return NF_STOLEN; - } - kfree_skb(skb); - skb = new_skb; - old_iph = ip_hdr(skb); - } - - skb->transport_header = old_transport_header; - - /* fix old IP header checksum */ - ip_send_check(old_iph); - - skb_push(skb, sizeof(struct iphdr)); - skb_reset_network_header(skb); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* - * Push down and install the IPIP header. - */ - iph = ip_hdr(skb); - iph->version = 4; - iph->ihl = sizeof(struct iphdr)>>2; - iph->frag_off = df; - iph->protocol = IPPROTO_IPIP; - iph->tos = tos; - iph->daddr = rt->rt_dst; - iph->saddr = rt->rt_src; - iph->ttl = old_iph->ttl; - ip_select_ident(iph, &rt->u.dst, NULL); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - ip_local_out(skb); - - LeaveFunction(10); - - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - - -/* - * Direct Routing transmitter - * Used for ANY protocol - */ -int -ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp) -{ - struct rtable *rt; /* Route to the other host */ - struct iphdr *iph = ip_hdr(skb); - int mtu; - - EnterFunction(10); - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { - icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); - goto tx_error; - } - - /* - * Call ip_send_check because we are not sure it is called - * after ip_defrag. Is copy-on-write needed? - */ - if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { - ip_rt_put(rt); - return NF_STOLEN; - } - ip_send_check(ip_hdr(skb)); - - /* drop old route */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(skb, rt); - - LeaveFunction(10); - return NF_STOLEN; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - kfree_skb(skb); - LeaveFunction(10); - return NF_STOLEN; -} - - -/* - * ICMP packet transmitter - * called by the ip_vs_in_icmp - */ -int -ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, - struct ip_vs_protocol *pp, int offset) -{ - struct rtable *rt; /* Route to the other host */ - int mtu; - int rc; - - EnterFunction(10); - - /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be - forwarded directly here, because there is no need to - translate address/port back */ - if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { - if (cp->packet_xmit) - rc = cp->packet_xmit(skb, cp, pp); - else - rc = NF_ACCEPT; - /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); - goto out; - } - - /* - * mangle and send the packet here (only for VS/NAT) - */ - - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) - goto tx_error_icmp; - - /* MTU checking */ - mtu = dst_mtu(&rt->u.dst); - if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); - goto tx_error; - } - - /* copy-on-write the packet before mangling it */ - if (!skb_make_writable(skb, offset)) - goto tx_error_put; - - if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) - goto tx_error_put; - - /* drop the old route when skb is not shared */ - dst_release(skb->dst); - skb->dst = &rt->u.dst; - - ip_vs_nat_icmp(skb, pp, cp, 0); - - /* Another hack: avoid icmp_send in ip_fragment */ - skb->local_df = 1; - - IP_VS_XMIT(skb, rt); - - rc = NF_STOLEN; - goto out; - - tx_error_icmp: - dst_link_failure(skb); - tx_error: - dev_kfree_skb(skb); - rc = NF_STOLEN; - out: - LeaveFunction(10); - return rc; - tx_error_put: - ip_rt_put(rt); - goto tx_error; -} |