diff options
35 files changed, 1372 insertions, 405 deletions
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 34551f8aaf9d..3ecc3050be0e 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -31,6 +31,7 @@ struct nfnetlink_subsystem { const struct nfnl_callback *cb; /* callback for individual types */ int (*commit)(struct net *net, struct sk_buff *skb); int (*abort)(struct net *net, struct sk_buff *skb); + void (*cleanup)(struct net *net); bool (*valid_genid)(struct net *net, u32 genid); }; diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 0ac795b41ab8..03f567eb9536 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -763,14 +763,14 @@ struct ip_vs_app { * 2=Mangled but checksum was not updated */ int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, - struct sk_buff *, int *diff); + struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* input hook: Process packet in outin direction, diff set for TCP. * Return: 0=Error, 1=Payload Not Mangled/Mangled but checksum is ok, * 2=Mangled but checksum was not updated */ int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, - struct sk_buff *, int *diff); + struct sk_buff *, int *diff, struct ip_vs_iphdr *ipvsh); /* ip_vs_app initializer */ int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); @@ -1328,8 +1328,10 @@ int register_ip_vs_app_inc(struct netns_ipvs *ipvs, struct ip_vs_app *app, __u16 int ip_vs_app_inc_get(struct ip_vs_app *inc); void ip_vs_app_inc_put(struct ip_vs_app *inc); -int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb); -int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb); +int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh); +int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh); int register_ip_vs_pe(struct ip_vs_pe *pe); int unregister_ip_vs_pe(struct ip_vs_pe *pe); diff --git a/include/net/netfilter/nf_socket.h b/include/net/netfilter/nf_socket.h index 8230fefff9f5..29b6313f0557 100644 --- a/include/net/netfilter/nf_socket.h +++ b/include/net/netfilter/nf_socket.h @@ -2,10 +2,8 @@ #ifndef _NF_SOCK_H_ #define _NF_SOCK_H_ -struct net_device; -struct sk_buff; -struct sock; -struct net; +#include <net/sock.h> +#include <net/inet_timewait_sock.h> static inline bool nf_sk_is_transparent(struct sock *sk) { diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 603b51401deb..435c32d8a995 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -858,6 +858,8 @@ enum nft_chain_flags { * @name: name of the chain */ struct nft_chain { + struct nft_rule *__rcu *rules_gen_0; + struct nft_rule *__rcu *rules_gen_1; struct list_head rules; struct list_head list; struct nft_table *table; @@ -867,8 +869,13 @@ struct nft_chain { u8 flags:6, genmask:2; char *name; + + /* Only used during control plane commit phase: */ + struct nft_rule **rules_next; }; +int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain); + enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index cd6915b6c054..e0c0c2558ec4 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -2,6 +2,8 @@ #ifndef _NET_NF_TABLES_CORE_H #define _NET_NF_TABLES_CORE_H +#include <net/netfilter/nf_tables.h> + extern struct nft_expr_type nft_imm_type; extern struct nft_expr_type nft_cmp_type; extern struct nft_expr_type nft_lookup_type; @@ -23,6 +25,12 @@ struct nft_cmp_fast_expr { u8 len; }; +struct nft_immediate_expr { + struct nft_data data; + enum nft_registers dreg:8; + u8 dlen; +}; + /* Calculate the mask for the nft_cmp_fast expression. On big endian the * mask needs to include the *upper* bytes when interpreting that data as * something smaller than the full u32, therefore a cpu_to_le32 is done. diff --git a/include/net/netns/nftables.h b/include/net/netns/nftables.h index 29c3851b486a..94767ea3a490 100644 --- a/include/net/netns/nftables.h +++ b/include/net/netns/nftables.h @@ -9,6 +9,7 @@ struct netns_nftables { struct list_head commit_list; unsigned int base_seq; u8 gencursor; + u8 validate_state; }; #endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 9c71f024f9cc..a089af092a29 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -905,6 +905,31 @@ enum nft_rt_attributes { #define NFTA_RT_MAX (__NFTA_RT_MAX - 1) /** + * enum nft_socket_attributes - nf_tables socket expression netlink attributes + * + * @NFTA_SOCKET_KEY: socket key to match + * @NFTA_SOCKET_DREG: destination register + */ +enum nft_socket_attributes { + NFTA_SOCKET_UNSPEC, + NFTA_SOCKET_KEY, + NFTA_SOCKET_DREG, + __NFTA_SOCKET_MAX +}; +#define NFTA_SOCKET_MAX (__NFTA_SOCKET_MAX - 1) + +/* + * enum nft_socket_keys - nf_tables socket expression keys + * + * @NFT_SOCKET_TRANSPARENT: Value of the IP(V6)_TRANSPARENT socket option_ + */ +enum nft_socket_keys { + NFT_SOCKET_TRANSPARENT, + __NFT_SOCKET_MAX +}; +#define NFT_SOCKET_MAX (__NFT_SOCKET_MAX - 1) + +/** * enum nft_ct_keys - nf_tables ct expression keys * * @NFT_CT_STATE: conntrack state (bitmask of enum ip_conntrack_info) @@ -1056,6 +1081,11 @@ enum nft_log_attributes { #define NFTA_LOG_MAX (__NFTA_LOG_MAX - 1) /** + * LOGLEVEL_AUDIT - a pseudo log level enabling audit logging + */ +#define LOGLEVEL_AUDIT 8 + +/** * enum nft_queue_attributes - nf_tables queue expression netlink attributes * * @NFTA_QUEUE_NUM: netlink queue to send messages to (NLA_U16) @@ -1230,10 +1260,14 @@ enum nft_dup_attributes { * enum nft_fwd_attributes - nf_tables fwd expression netlink attributes * * @NFTA_FWD_SREG_DEV: source register of output interface (NLA_U32: nft_register) + * @NFTA_FWD_SREG_ADDR: source register of destination address (NLA_U32: nft_register) + * @NFTA_FWD_NFPROTO: layer 3 family of source register address (NLA_U32: enum nfproto) */ enum nft_fwd_attributes { NFTA_FWD_UNSPEC, NFTA_FWD_SREG_DEV, + NFTA_FWD_SREG_ADDR, + NFTA_FWD_NFPROTO, __NFTA_FWD_MAX }; #define NFTA_FWD_MAX (__NFTA_FWD_MAX - 1) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 280048e1e395..d03bc5a01a70 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -129,10 +129,7 @@ config NFT_CHAIN_NAT_IPV4 source and destination ports. config NF_NAT_MASQUERADE_IPV4 - tristate "IPv4 masquerade support" - help - This is the kernel functionality to provide NAT in the masquerade - flavour (automatic source address selection). + bool config NFT_MASQ_IPV4 tristate "IPv4 masquerading support for nf_tables" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 0e5edd0c7926..c4b05b174091 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -10,6 +10,7 @@ nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o +nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o # defrag @@ -32,9 +33,6 @@ nf_nat_snmp_basic-y := nf_nat_snmp_basic.asn1.o nf_nat_snmp_basic_main.o $(obj)/nf_nat_snmp_basic_main.o: $(obj)/nf_nat_snmp_basic.asn1.h obj-$(CONFIG_NF_NAT_SNMP_BASIC) += nf_nat_snmp_basic.o -obj-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o - - # NAT protocols (nf_nat) obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index f538c5001547..ad3aeff152ed 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -7,7 +7,6 @@ */ #include <linux/types.h> -#include <linux/module.h> #include <linux/atomic.h> #include <linux/inetdevice.h> #include <linux/ip.h> @@ -157,6 +156,3 @@ void nf_nat_masquerade_ipv4_unregister_notifier(void) unregister_inetaddr_notifier(&masq_inet_notifier); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv4_unregister_notifier); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index ce77bcc2490c..9f5b00a39adf 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -136,10 +136,7 @@ config NF_NAT_IPV6 if NF_NAT_IPV6 config NF_NAT_MASQUERADE_IPV6 - tristate "IPv6 masquerade support" - help - This is the kernel functionality to provide NAT in the masquerade - flavour (automatic source address selection) for IPv6. + bool endif # NF_NAT_IPV6 diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index 44273d6f03a5..71518f22ae39 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -18,8 +18,8 @@ nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o +nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o -obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o # defrag nf_defrag_ipv6-y := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 9dfc2b90c362..e6eb7cf9b54f 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -10,7 +10,6 @@ */ #include <linux/kernel.h> -#include <linux/module.h> #include <linux/atomic.h> #include <linux/netdevice.h> #include <linux/ipv6.h> @@ -186,6 +185,3 @@ void nf_nat_masquerade_ipv6_unregister_notifier(void) unregister_netdevice_notifier(&masq_dev_notifier); } EXPORT_SYMBOL_GPL(nf_nat_masquerade_ipv6_unregister_notifier); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index a5b60e6a983e..276e1e32f44e 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -433,11 +433,7 @@ config NF_NAT_TFTP default NF_NAT && NF_CONNTRACK_TFTP config NF_NAT_REDIRECT - tristate "IPv4/IPv6 redirect support" - depends on NF_NAT - help - This is the kernel functionality to redirect packets to local - machine through NAT. + bool config NETFILTER_SYNPROXY tristate @@ -617,6 +613,15 @@ config NFT_FIB_INET The lookup will be delegated to the IPv4 or IPv6 FIB depending on the protocol of the packet. +config NFT_SOCKET + tristate "Netfilter nf_tables socket match support" + depends on IPV6 || IPV6=n + select NF_SOCKET_IPV4 + select NF_SOCKET_IPV6 if IPV6 + help + This option allows matching for the presence or absence of a + corresponding socket and its attributes. + if NF_TABLES_NETDEV config NF_DUP_NETDEV diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1aa710b5d384..eec169555731 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -55,7 +55,7 @@ obj-$(CONFIG_NF_LOG_COMMON) += nf_log_common.o obj-$(CONFIG_NF_LOG_NETDEV) += nf_log_netdev.o obj-$(CONFIG_NF_NAT) += nf_nat.o -obj-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o +nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o # NAT helpers obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o @@ -102,6 +102,7 @@ obj-$(CONFIG_NFT_FIB) += nft_fib.o obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NF_OSF) += nf_osf.o +obj-$(CONFIG_NFT_SOCKET) += nft_socket.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 1c98c907bc63..12d74896556a 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, } static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) + struct ip_vs_app *app, + struct ip_vs_iphdr *ipvsh) { int diff; const unsigned int tcp_offset = ip_hdrlen(skb); @@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, if (app->pkt_out == NULL) return 1; - if (!app->pkt_out(app, cp, skb, &diff)) + if (!app->pkt_out(app, cp, skb, &diff, ipvsh)) return 0; /* @@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, * called by ipvs packet handler, assumes previously checked cp!=NULL * returns false if it can't handle packet (oom) */ -int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh) { struct ip_vs_app *app; @@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) /* TCP is complicated */ if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_out(cp, skb, app); + return app_tcp_pkt_out(cp, skb, app, ipvsh); /* * Call private output hook function @@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb) if (app->pkt_out == NULL) return 1; - return app->pkt_out(app, cp, skb, NULL); + return app->pkt_out(app, cp, skb, NULL, ipvsh); } static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, - struct ip_vs_app *app) + struct ip_vs_app *app, + struct ip_vs_iphdr *ipvsh) { int diff; const unsigned int tcp_offset = ip_hdrlen(skb); @@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, if (app->pkt_in == NULL) return 1; - if (!app->pkt_in(app, cp, skb, &diff)) + if (!app->pkt_in(app, cp, skb, &diff, ipvsh)) return 0; /* @@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, * called by ipvs packet handler, assumes previously checked cp!=NULL. * returns false if can't handle packet (oom). */ -int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb, + struct ip_vs_iphdr *ipvsh) { struct ip_vs_app *app; @@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) /* TCP is complicated */ if (cp->protocol == IPPROTO_TCP) - return app_tcp_pkt_in(cp, skb, app); + return app_tcp_pkt_in(cp, skb, app, ipvsh); /* * Call private input hook function @@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) if (app->pkt_in == NULL) return 1; - return app->pkt_in(app, cp, skb, NULL); + return app->pkt_in(app, cp, skb, NULL, ipvsh); } diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 58d5d05aec24..4398a72edec5 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -29,6 +29,8 @@ #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/skbuff.h> +#include <linux/ctype.h> +#include <linux/inet.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/netfilter.h> @@ -44,9 +46,18 @@ #include <net/ip_vs.h> -#define SERVER_STRING "227 " -#define CLIENT_STRING "PORT" +#define SERVER_STRING_PASV "227 " +#define CLIENT_STRING_PORT "PORT" +#define SERVER_STRING_EPSV "229 " +#define CLIENT_STRING_EPRT "EPRT" +enum { + IP_VS_FTP_ACTIVE = 0, + IP_VS_FTP_PORT = 0, + IP_VS_FTP_PASV, + IP_VS_FTP_EPRT, + IP_VS_FTP_EPSV, +}; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper @@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444); MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands"); -/* Dummy variable */ -static int ip_vs_ftp_pasv; +static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh) +{ + struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len); + + if ((th->doff << 2) < sizeof(struct tcphdr)) + return NULL; + return (char *)th + (th->doff << 2); +} static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) @@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) } -/* - * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started - * with the "pattern", ignoring before "skip" and terminated with - * the "term" character. - * <addr,port> is in network order. +/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern". <addr,port> is in network order. + * Parse extended format depending on ext. In this case addr can be pre-set. */ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, const char *pattern, size_t plen, - char skip, char term, - __be32 *addr, __be16 *port, - char **start, char **end) + char skip, bool ext, int mode, + union nf_inet_addr *addr, __be16 *port, + __u16 af, char **start, char **end) { char *s, c; unsigned char p[6]; + char edelim; + __u16 hport; int i = 0; if (data_limit - data < plen) { @@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, if (s == data_limit) return -1; if (!found) { + /* "(" is optional for non-extended format, + * so catch the start of IPv4 address + */ + if (!ext && isdigit(*s)) + break; if (*s == skip) found = 1; } else if (*s != skip) { @@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, } } } + /* Old IPv4-only format? */ + if (!ext) { + p[0] = 0; + for (data = s; ; data++) { + if (data == data_limit) + return -1; + c = *data; + if (isdigit(c)) { + p[i] = p[i]*10 + c - '0'; + } else if (c == ',' && i < 5) { + i++; + p[i] = 0; + } else { + /* unexpected character or terminator */ + break; + } + } - for (data = s; ; data++) { - if (data == data_limit) + if (i != 5) return -1; - if (*data == term) - break; + + *start = s; + *end = data; + addr->ip = get_unaligned((__be32 *) p); + *port = get_unaligned((__be16 *) (p + 4)); + return 1; } - *end = data; + if (s == data_limit) + return -1; + *start = s; + edelim = *s++; + if (edelim < 33 || edelim > 126) + return -1; + if (s == data_limit) + return -1; + if (*s == edelim) { + /* Address family is usually missing for EPSV response */ + if (mode != IP_VS_FTP_EPSV) + return -1; + s++; + if (s == data_limit) + return -1; + /* Then address should be missing too */ + if (*s != edelim) + return -1; + /* Caller can pre-set addr, if needed */ + s++; + } else { + const char *ep; - memset(p, 0, sizeof(p)); - for (data = s; ; data++) { - c = *data; - if (c == term) - break; - if (c >= '0' && c <= '9') { - p[i] = p[i]*10 + c - '0'; - } else if (c == ',' && i < 5) { - i++; - } else { - /* unexpected character */ + /* We allow address only from same family */ + if (af == AF_INET6 && *s != '2') return -1; + if (af == AF_INET && *s != '1') + return -1; + s++; + if (s == data_limit) + return -1; + if (*s != edelim) + return -1; + s++; + if (s == data_limit) + return -1; + if (af == AF_INET6) { + if (in6_pton(s, data_limit - s, (u8 *)addr, edelim, + &ep) <= 0) + return -1; + } else { + if (in4_pton(s, data_limit - s, (u8 *)addr, edelim, + &ep) <= 0) + return -1; } + s = (char *) ep; + if (s == data_limit) + return -1; + if (*s != edelim) + return -1; + s++; } - - if (i != 5) + for (hport = 0; ; s++) + { + if (s == data_limit) + return -1; + if (!isdigit(*s)) + break; + hport = hport * 10 + *s - '0'; + } + if (s == data_limit || !hport || *s != edelim) return -1; - - *start = s; - *addr = get_unaligned((__be32 *) p); - *port = get_unaligned((__be16 *) (p + 4)); + s++; + *end = s; + *port = htons(hport); return 1; } -/* - * Look at outgoing ftp packets to catch the response to a PASV command +/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command * from the server (inside-to-outside). * When we see one, we build a connection entry with the client address, * client port 0 (unknown at the moment), the server address and the @@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, * The outgoing packet should be something like * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + * The extended format for EPSV response provides usually only port: + * "229 Entering Extended Passive Mode (|||ppp|)" */ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) + struct sk_buff *skb, int *diff, + struct ip_vs_iphdr *ipvsh) { - struct iphdr *iph; - struct tcphdr *th; char *data, *data_limit; char *start, *end; union nf_inet_addr from; @@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, *diff = 0; -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; @@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, if (!skb_make_writable(skb, skb->len)) return 0; - if (cp->app_data == &ip_vs_ftp_pasv) { - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - data = (char *)th + (th->doff << 2); + if (cp->app_data == (void *) IP_VS_FTP_PASV) { + data = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); + if (!data || data >= data_limit) + return 1; + if (ip_vs_ftp_get_addrport(data, data_limit, - SERVER_STRING, - sizeof(SERVER_STRING)-1, - '(', ')', - &from.ip, &port, + SERVER_STRING_PASV, + sizeof(SERVER_STRING_PASV)-1, + '(', false, IP_VS_FTP_PASV, + &from, &port, cp->af, &start, &end) != 1) return 1; - IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n", + IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n", &from.ip, ntohs(port), &cp->caddr.ip, 0); + } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { + data = ip_vs_ftp_data_ptr(skb, ipvsh); + data_limit = skb_tail_pointer(skb); - /* - * Now update or create an connection entry for it + if (!data || data >= data_limit) + return 1; + + /* Usually, data address is not specified but + * we support different address, so pre-set it. */ - { - struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, AF_INET, - iph->protocol, &from, port, - &cp->caddr, 0, &p); - n_cp = ip_vs_conn_out_get(&p); - } - if (!n_cp) { - struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, - AF_INET, IPPROTO_TCP, &cp->caddr, - 0, &cp->vaddr, port, &p); - /* As above, this is ipv4 only */ - n_cp = ip_vs_conn_new(&p, AF_INET, &from, port, - IP_VS_CONN_F_NO_CPORT | - IP_VS_CONN_F_NFCT, - cp->dest, skb->mark); - if (!n_cp) - return 0; + from = cp->daddr; + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING_EPSV, + sizeof(SERVER_STRING_EPSV)-1, + '(', true, IP_VS_FTP_EPSV, + &from, &port, cp->af, + &start, &end) != 1) + return 1; - /* add its controller */ - ip_vs_control_add(n_cp, cp); - } + IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n", + IP_VS_DBG_ADDR(cp->af, &from), ntohs(port), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0); + } else { + return 1; + } - /* - * Replace the old passive address with the new one - */ + /* Now update or create a connection entry for it */ + { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, cp->af, + ipvsh->protocol, &from, port, + &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } + if (!n_cp) { + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->ipvs, + cp->af, ipvsh->protocol, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, cp->af, &from, port, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, + cp->dest, skb->mark); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* Replace the old passive address with the new one */ + if (cp->app_data == (void *) IP_VS_FTP_PASV) { from.ip = n_cp->vaddr.ip; port = n_cp->vport; snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", @@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ((unsigned char *)&from.ip)[3], ntohs(port) >> 8, ntohs(port) & 0xFF); + } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) { + from = n_cp->vaddr; + port = n_cp->vport; + /* Only port, client will use VIP for the data connection */ + snprintf(buf, sizeof(buf), "|||%u|", + ntohs(port)); + } else { + *buf = 0; + } + buf_len = strlen(buf); - buf_len = strlen(buf); - - ct = nf_ct_get(skb, &ctinfo); - if (ct) { - bool mangled; - - /* If mangling fails this function will return 0 - * which will cause the packet to be dropped. - * Mangling can only fail under memory pressure, - * hopefully it will succeed on the retransmitted - * packet. - */ - mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, - iph->ihl * 4, - start - data, - end - start, - buf, buf_len); - if (mangled) { - ip_vs_nfct_expect_related(skb, ct, n_cp, - IPPROTO_TCP, 0, 0); - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_UNNECESSARY; - /* csum is updated */ - ret = 1; - } - } + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + bool mangled; - /* - * Not setting 'diff' is intentional, otherwise the sequence - * would be adjusted twice. + /* If mangling fails this function will return 0 + * which will cause the packet to be dropped. + * Mangling can only fail under memory pressure, + * hopefully it will succeed on the retransmitted + * packet. */ - - cp->app_data = NULL; - ip_vs_tcp_conn_listen(n_cp); - ip_vs_conn_put(n_cp); - return ret; + mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + ipvsh->len, + start - data, + end - start, + buf, buf_len); + if (mangled) { + ip_vs_nfct_expect_related(skb, ct, n_cp, + ipvsh->protocol, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } } - return 1; + + /* Not setting 'diff' is intentional, otherwise the sequence + * would be adjusted twice. + */ + + cp->app_data = (void *) IP_VS_FTP_ACTIVE; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; } -/* - * Look at incoming ftp packets to catch the PASV/PORT command +/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command * (outside-to-inside). * * The incoming packet having the PORT command should be something like @@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * In this case, we create a connection entry using the client address and * port, so that the active ftp data connection from the server can reach * the client. + * Extended format: + * "EPSV\r\n" when client requests server address from same family + * "EPSV 1\r\n" when client requests IPv4 server address + * "EPSV 2\r\n" when client requests IPv6 server address + * "EPSV ALL\r\n" - not supported + * EPRT with specified delimiter (ASCII 33..126), "|" by default: + * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport + * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport */ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, - struct sk_buff *skb, int *diff) + struct sk_buff *skb, int *diff, + struct ip_vs_iphdr *ipvsh) { - struct iphdr *iph; - struct tcphdr *th; char *data, *data_start, *data_limit; char *start, *end; union nf_inet_addr to; @@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, /* no diff required for incoming packets */ *diff = 0; -#ifdef CONFIG_IP_VS_IPV6 - /* This application helper doesn't work with IPv6 yet, - * so turn this into a no-op for IPv6 packets - */ - if (cp->af == AF_INET6) - return 1; -#endif - /* Only useful for established sessions */ if (cp->state != IP_VS_TCP_S_ESTABLISHED) return 1; @@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, if (!skb_make_writable(skb, skb->len)) return 0; - /* - * Detecting whether it is passive - */ - iph = ip_hdr(skb); - th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); - - /* Since there may be OPTIONS in the TCP packet and the HLEN is - the length of the header in 32-bit multiples, it is accurate - to calculate data address by th+HLEN*4 */ - data = data_start = (char *)th + (th->doff << 2); + data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh); data_limit = skb_tail_pointer(skb); + if (!data || data >= data_limit) + return 1; while (data <= data_limit - 6) { - if (strncasecmp(data, "PASV\r\n", 6) == 0) { + if (cp->af == AF_INET && + strncasecmp(data, "PASV\r\n", 6) == 0) { /* Passive mode on */ IP_VS_DBG(7, "got PASV at %td of %td\n", data - data_start, data_limit - data_start); - cp->app_data = &ip_vs_ftp_pasv; + cp->app_data = (void *) IP_VS_FTP_PASV; return 1; } + + /* EPSV or EPSV<space><net-prt> */ + if (strncasecmp(data, "EPSV", 4) == 0 && + (data[4] == ' ' || data[4] == '\r')) { + if (data[4] == ' ') { + char proto = data[5]; + + if (data > data_limit - 7 || data[6] != '\r') + return 1; + +#ifdef CONFIG_IP_VS_IPV6 + if (cp->af == AF_INET6 && proto == '2') { + } else +#endif + if (cp->af == AF_INET && proto == '1') { + } else { + return 1; + } + } + /* Extended Passive mode on */ + IP_VS_DBG(7, "got EPSV at %td of %td\n", + data - data_start, + data_limit - data_start); + cp->app_data = (void *) IP_VS_FTP_EPSV; + return 1; + } + data++; } @@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, * then create a new connection entry for the coming data * connection. */ - if (ip_vs_ftp_get_addrport(data_start, data_limit, - CLIENT_STRING, sizeof(CLIENT_STRING)-1, - ' ', '\r', &to.ip, &port, - &start, &end) != 1) + if (cp->af == AF_INET && + ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING_PORT, + sizeof(CLIENT_STRING_PORT)-1, + ' ', false, IP_VS_FTP_PORT, + &to, &port, cp->af, + &start, &end) == 1) { + + IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port)); + + /* Now update or create a connection entry for it */ + IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n", + ip_vs_proto_name(ipvsh->protocol), + &to.ip, ntohs(port), &cp->vaddr.ip, + ntohs(cp->vport)-1); + } else if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING_EPRT, + sizeof(CLIENT_STRING_EPRT)-1, + ' ', true, IP_VS_FTP_EPRT, + &to, &port, cp->af, + &start, &end) == 1) { + + IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n", + IP_VS_DBG_ADDR(cp->af, &to), ntohs(port)); + + /* Now update or create a connection entry for it */ + IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n", + ip_vs_proto_name(ipvsh->protocol), + IP_VS_DBG_ADDR(cp->af, &to), ntohs(port), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), + ntohs(cp->vport)-1); + } else { return 1; - - IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port)); + } /* Passive mode off */ - cp->app_data = NULL; - - /* - * Now update or create a connection entry for it - */ - IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n", - ip_vs_proto_name(iph->protocol), - &to.ip, ntohs(port), &cp->vaddr.ip, 0); + cp->app_data = (void *) IP_VS_FTP_ACTIVE; { struct ip_vs_conn_param p; - ip_vs_conn_fill_param(cp->ipvs, AF_INET, - iph->protocol, &to, port, &cp->vaddr, + ip_vs_conn_fill_param(cp->ipvs, cp->af, + ipvsh->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &p); n_cp = ip_vs_conn_in_get(&p); if (!n_cp) { - /* This is ipv4 only */ - n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr, + n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr, htons(ntohs(cp->dport)-1), IP_VS_CONN_F_NFCT, cp->dest, skb->mark); @@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net) ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]); if (ret) goto err_unreg; - pr_info("%s: loaded support on port[%d] = %d\n", + pr_info("%s: loaded support on port[%d] = %u\n", app->name, i, ports[i]); } return 0; diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 6cf3fd81a5ec..eb8b9c883889 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -67,15 +67,20 @@ #include <net/netfilter/nf_conntrack_zones.h> -#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" -#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ - &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ +#define FMT_TUPLE "%s:%u->%s:%u/%u" +#define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \ + ntohs((T)->src.u.all), \ + IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \ + ntohs((T)->dst.u.all), \ (T)->dst.protonum -#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" -#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ - &((C)->vaddr.ip), ntohs((C)->vport), \ - &((C)->daddr.ip), ntohs((C)->dport), \ +#define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u" +#define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \ + ntohs((C)->cport), \ + IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \ + ntohs((C)->vport), \ + IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \ + ntohs((C)->dport), \ (C)->protocol, (C)->state void @@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) new_tuple.dst.protonum != IPPROTO_ICMPV6) new_tuple.dst.u.tcp.port = cp->vport; } - IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " - "ctinfo=%d, old reply=" FMT_TUPLE - ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", - __func__, ct, ct->status, ctinfo, - ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), - ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)); + IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, new reply=" FMT_TUPLE "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&new_tuple)); nf_conntrack_alter_reply(ct, &new_tuple); + IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n", + __func__, ct, ARG_CONN(cp)); } int ip_vs_confirm_conntrack(struct sk_buff *skb) @@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, struct ip_vs_conn_param p; struct net *net = nf_ct_net(ct); - if (exp->tuple.src.l3num != PF_INET) - return; - /* * We assume that no NF locks are held before this callback. * ip_vs_conn_out_get and ip_vs_conn_in_get should match their @@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, cp = ip_vs_conn_out_get(&p); if (cp) { /* Change reply CLIENT->RS to CLIENT->VS */ + IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp=" + FMT_CONN "\n", + __func__, ct, ct->status, ARG_CONN(cp)); new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found inout cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&new_reply)); new_reply.dst.u3 = cp->vaddr; new_reply.dst.u.tcp.port = cp->vport; - IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE - ", inout cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); goto alter; } @@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, cp = ip_vs_conn_in_get(&p); if (cp) { /* Change reply VS->CLIENT to RS->CLIENT */ + IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp=" + FMT_CONN "\n", + __func__, ct, ct->status, ARG_CONN(cp)); new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found outin cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&new_reply)); new_reply.src.u3 = cp->daddr; new_reply.src.u.tcp.port = cp->dport; - IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " - FMT_TUPLE ", outin cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); goto alter; } - IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE - " - unknown expect\n", - __func__, ct, ct->status, ARG_TUPLE(orig)); + IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); return; alter: @@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, exp->expectfn = ip_vs_nfct_expect_callback; - IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&exp->tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); nf_ct_expect_related(exp); nf_ct_expect_put(exp); } @@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) tuple.dst.u3 = cp->vaddr; tuple.dst.u.all = cp->vport; - IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE - " for conn " FMT_CONN "\n", - __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n", + __func__, ARG_CONN(cp)); h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_kill(ct)) { - IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple=" - FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); } else { - IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" - FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); } nf_ct_put(ct); } else { - IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", - __func__, ARG_TUPLE(&tuple)); + IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); } } diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index eff7569824e5..3250c4a1111e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - ret = ip_vs_app_pkt_out(cp, skb); + ret = ip_vs_app_pkt_out(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ @@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - ret = ip_vs_app_pkt_in(cp, skb); + ret = ip_vs_app_pkt_in(cp, skb, iph); if (ret == 0) return 0; /* ret=2: csum update is needed after payload mangling */ diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 569631d2b2a1..80d10ad12a15 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; /* Call application helper if needed */ - if (!(ret = ip_vs_app_pkt_out(cp, skb))) + if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) @@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ - if (!(ret = ip_vs_app_pkt_in(cp, skb))) + if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index c15ef7c2a1fa..e0ef11c3691e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, /* * Call application helper if needed */ - if (!(ret = ip_vs_app_pkt_out(cp, skb))) + if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) @@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, * Attempt ip_vs_app call. * It will fix ip_vs_conn */ - if (!(ret = ip_vs_app_pkt_in(cp, skb))) + if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 82451b7e0acb..15ed91309992 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -220,7 +220,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, enum flow_offload_tuple_dir dir; struct flow_offload *flow; struct net_device *outdev; - const struct rtable *rt; + struct rtable *rt; unsigned int thoff; struct iphdr *iph; __be32 nexthop; @@ -241,7 +241,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); - rt = (const struct rtable *)flow->tuplehash[dir].tuple.dst_cache; + rt = (struct rtable *)flow->tuplehash[dir].tuple.dst_cache; if (unlikely(nf_flow_exceeds_mtu(skb, flow->tuplehash[dir].tuple.mtu)) && (ip_hdr(skb)->frag_off & htons(IP_DF)) != 0) @@ -264,6 +264,7 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + skb_dst_set_noref(skb, &rt->dst); neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); return NF_STOLEN; @@ -480,6 +481,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + skb_dst_set_noref(skb, &rt->dst); neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); return NF_STOLEN; diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 821f8d835f7a..b7df32a56e7e 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -1036,7 +1036,7 @@ static struct pernet_operations nat_net_ops = { .size = sizeof(struct nat_net), }; -struct nf_nat_hook nat_hook = { +static struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 7c4bb0a773ca..adee04af8d43 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -15,7 +15,6 @@ #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/kernel.h> -#include <linux/module.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/types.h> @@ -124,6 +123,3 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); } EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 87b2a77add65..c785bc5a66f1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -28,6 +28,28 @@ static LIST_HEAD(nf_tables_objects); static LIST_HEAD(nf_tables_flowtables); static u64 table_handle; +enum { + NFT_VALIDATE_SKIP = 0, + NFT_VALIDATE_NEED, + NFT_VALIDATE_DO, +}; + +static void nft_validate_state_update(struct net *net, u8 new_validate_state) +{ + switch (net->nft.validate_state) { + case NFT_VALIDATE_SKIP: + WARN_ON_ONCE(new_validate_state == NFT_VALIDATE_DO); + break; + case NFT_VALIDATE_NEED: + break; + case NFT_VALIDATE_DO: + if (new_validate_state == NFT_VALIDATE_NEED) + return; + } + + net->nft.validate_state = new_validate_state; +} + static void nft_ctx_init(struct nft_ctx *ctx, struct net *net, const struct sk_buff *skb, @@ -373,7 +395,7 @@ static struct nft_table *nft_table_lookup(const struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry_rcu(table, &net->nft.tables, list) { if (!nla_strcmp(nla, table->name) && table->family == family && nft_active_genmask(table, genmask)) @@ -546,6 +568,24 @@ done: return skb->len; } +static int nft_netlink_dump_start_rcu(struct sock *nlsk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *c) +{ + int err; + + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + rcu_read_unlock(); + err = netlink_dump_start(nlsk, skb, nlh, c); + rcu_read_lock(); + module_put(THIS_MODULE); + + return err; +} + +/* called with rcu_read_lock held */ static int nf_tables_gettable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -561,8 +601,10 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_tables, + .module = THIS_MODULE, }; - return netlink_dump_start(nlsk, skb, nlh, &c); + + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } table = nft_table_lookup(net, nla[NFTA_TABLE_NAME], family, genmask); @@ -571,7 +613,7 @@ static int nf_tables_gettable(struct net *net, struct sock *nlsk, return PTR_ERR(table); } - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -933,7 +975,7 @@ static struct nft_chain *nft_chain_lookup(const struct nft_table *table, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry(chain, &table->chains, list) { + list_for_each_entry_rcu(chain, &table->chains, list) { if (!nla_strcmp(nla, chain->name) && nft_active_genmask(chain, genmask)) return chain; @@ -1135,6 +1177,7 @@ done: return skb->len; } +/* called with rcu_read_lock held */ static int nf_tables_getchain(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -1151,8 +1194,10 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, if (nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = nf_tables_dump_chains, + .module = THIS_MODULE, }; - return netlink_dump_start(nlsk, skb, nlh, &c); + + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask); @@ -1167,7 +1212,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk, return PTR_ERR(chain); } - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -1237,12 +1282,29 @@ static void nft_chain_stats_replace(struct nft_base_chain *chain, rcu_assign_pointer(chain->stats, newstats); } +static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) +{ + struct nft_rule **g0 = rcu_dereference_raw(chain->rules_gen_0); + struct nft_rule **g1 = rcu_dereference_raw(chain->rules_gen_1); + + if (g0 != g1) + kvfree(g1); + kvfree(g0); + + /* should be NULL either via abort or via successful commit */ + WARN_ON_ONCE(chain->rules_next); + kvfree(chain->rules_next); +} + static void nf_tables_chain_destroy(struct nft_ctx *ctx) { struct nft_chain *chain = ctx->chain; BUG_ON(chain->use > 0); + /* no concurrent access possible anymore */ + nf_tables_chain_free_chain_rules(chain); + if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); @@ -1335,6 +1397,27 @@ static void nft_chain_release_hook(struct nft_chain_hook *hook) module_put(hook->type->owner); } +struct nft_rules_old { + struct rcu_head h; + struct nft_rule **start; +}; + +static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *chain, + unsigned int alloc) +{ + if (alloc > INT_MAX) + return NULL; + + alloc += 1; /* NULL, ends rules */ + if (sizeof(struct nft_rule *) > INT_MAX / alloc) + return NULL; + + alloc *= sizeof(struct nft_rule *); + alloc += sizeof(struct nft_rules_old); + + return kvmalloc(alloc, GFP_KERNEL); +} + static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, u8 policy, bool create) { @@ -1344,6 +1427,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct nft_stats __percpu *stats; struct net *net = ctx->net; struct nft_chain *chain; + struct nft_rule **rules; int err; if (table->use == UINT_MAX) @@ -1406,6 +1490,16 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err1; } + rules = nf_tables_chain_alloc_rules(chain, 0); + if (!rules) { + err = -ENOMEM; + goto err1; + } + + *rules = NULL; + rcu_assign_pointer(chain->rules_gen_0, rules); + rcu_assign_pointer(chain->rules_gen_1, rules); + err = nf_tables_register_hook(net, table, chain); if (err < 0) goto err1; @@ -1849,19 +1943,7 @@ static int nf_tables_newexpr(const struct nft_ctx *ctx, goto err1; } - if (ops->validate) { - const struct nft_data *data = NULL; - - err = ops->validate(ctx, expr, &data); - if (err < 0) - goto err2; - } - return 0; - -err2: - if (ops->destroy) - ops->destroy(ctx, expr); err1: expr->ops = NULL; return err; @@ -1920,7 +2002,7 @@ static struct nft_rule *__nft_rule_lookup(const struct nft_chain *chain, struct nft_rule *rule; // FIXME: this sucks - list_for_each_entry(rule, &chain->rules, list) { + list_for_each_entry_rcu(rule, &chain->rules, list) { if (handle == rule->handle) return rule; } @@ -2116,6 +2198,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) return 0; } +/* called with rcu_read_lock held */ static int nf_tables_getrule(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2134,18 +2217,19 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_rules, .done = nf_tables_dump_rules_done, + .module = THIS_MODULE, }; if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) { struct nft_rule_dump_ctx *ctx; - ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); if (!ctx) return -ENOMEM; if (nla[NFTA_RULE_TABLE]) { ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], - GFP_KERNEL); + GFP_ATOMIC); if (!ctx->table) { kfree(ctx); return -ENOMEM; @@ -2153,7 +2237,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, } if (nla[NFTA_RULE_CHAIN]) { ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], - GFP_KERNEL); + GFP_ATOMIC); if (!ctx->chain) { kfree(ctx->table); kfree(ctx); @@ -2163,7 +2247,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, c.data = ctx; } - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask); @@ -2184,7 +2268,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return PTR_ERR(rule); } - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -2225,6 +2309,53 @@ static void nf_tables_rule_release(const struct nft_ctx *ctx, nf_tables_rule_destroy(ctx, rule); } +int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) +{ + struct nft_expr *expr, *last; + const struct nft_data *data; + struct nft_rule *rule; + int err; + + list_for_each_entry(rule, &chain->rules, list) { + if (!nft_is_active_next(ctx->net, rule)) + continue; + + nft_rule_for_each_expr(expr, last, rule) { + if (!expr->ops->validate) + continue; + + err = expr->ops->validate(ctx, expr, &data); + if (err < 0) + return err; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(nft_chain_validate); + +static int nft_table_validate(struct net *net, const struct nft_table *table) +{ + struct nft_chain *chain; + struct nft_ctx ctx = { + .net = net, + .family = table->family, + }; + int err; + + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_base_chain(chain)) + continue; + + ctx.chain = chain; + err = nft_chain_validate(&ctx, chain); + if (err < 0) + return err; + } + + return 0; +} + #define NFT_RULE_MAXEXPRS 128 static struct nft_expr_info *info; @@ -2352,6 +2483,10 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, err = nf_tables_newexpr(&ctx, &info[i], expr); if (err < 0) goto err2; + + if (info[i].ops->validate) + nft_validate_state_update(net, NFT_VALIDATE_NEED); + info[i].ops = NULL; expr = nft_expr_next(expr); } @@ -2395,8 +2530,11 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, } } chain->use++; - return 0; + if (net->nft.validate_state == NFT_VALIDATE_DO) + return nft_table_validate(net, table); + + return 0; err2: nf_tables_rule_release(&ctx, rule); err1: @@ -2655,7 +2793,7 @@ static struct nft_set *nft_set_lookup(const struct nft_table *table, if (nla == NULL) return ERR_PTR(-EINVAL); - list_for_each_entry(set, &table->sets, list) { + list_for_each_entry_rcu(set, &table->sets, list) { if (!nla_strcmp(nla, set->name) && nft_active_genmask(set, genmask)) return set; @@ -2781,7 +2919,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr *nla, u64 *result) return 0; } -static u64 nf_jiffies64_to_msecs(u64 input) +static __be64 nf_jiffies64_to_msecs(u64 input) { u64 ms = jiffies64_to_nsecs(input); @@ -2960,6 +3098,7 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb) return 0; } +/* called with rcu_read_lock held */ static int nf_tables_getset(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -2982,17 +3121,18 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_sets, .done = nf_tables_dump_sets_done, + .module = THIS_MODULE, }; struct nft_ctx *ctx_dump; - ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_KERNEL); + ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC); if (ctx_dump == NULL) return -ENOMEM; *ctx_dump = ctx; c.data = ctx_dump; - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } /* Only accept unspec with dump */ @@ -3005,7 +3145,7 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, if (IS_ERR(set)) return PTR_ERR(set); - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) return -ENOMEM; @@ -3746,7 +3886,7 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set, ext = nft_set_elem_ext(set, &elem); err = -ENOMEM; - skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb == NULL) goto err1; @@ -3768,6 +3908,7 @@ err1: return err == -EAGAIN ? -ENOBUFS : err; } +/* called with rcu_read_lock held */ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -3792,10 +3933,11 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_set, .done = nf_tables_dump_set_done, + .module = THIS_MODULE, }; struct nft_set_dump_ctx *dump_ctx; - dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_KERNEL); + dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC); if (!dump_ctx) return -ENOMEM; @@ -3803,7 +3945,7 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk, dump_ctx->ctx = ctx; c.data = dump_ctx; - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } if (!nla[NFTA_SET_ELEM_LIST_ELEMENTS]) @@ -4034,6 +4176,12 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, d2.type, d2.len); if (err < 0) goto err3; + + if (d2.type == NFT_DATA_VERDICT && + (data.verdict.code == NFT_GOTO || + data.verdict.code == NFT_JUMP)) + nft_validate_state_update(ctx->net, + NFT_VALIDATE_NEED); } nft_set_ext_add_length(&tmpl, NFT_SET_EXT_DATA, d2.len); @@ -4133,7 +4281,7 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, const struct nlattr *attr; struct nft_set *set; struct nft_ctx ctx; - int rem, err = 0; + int rem, err; if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL) return -EINVAL; @@ -4154,9 +4302,13 @@ static int nf_tables_newsetelem(struct net *net, struct sock *nlsk, nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { err = nft_add_set_elem(&ctx, set, attr, nlh->nlmsg_flags); if (err < 0) - break; + return err; } - return err; + + if (net->nft.validate_state == NFT_VALIDATE_DO) + return nft_table_validate(net, ctx.table); + + return 0; } /** @@ -4426,7 +4578,7 @@ struct nft_object *nft_obj_lookup(const struct nft_table *table, { struct nft_object *obj; - list_for_each_entry(obj, &table->objects, list) { + list_for_each_entry_rcu(obj, &table->objects, list) { if (!nla_strcmp(nla, obj->name) && objtype == obj->ops->type->type && nft_active_genmask(obj, genmask)) @@ -4756,12 +4908,12 @@ nft_obj_filter_alloc(const struct nlattr * const nla[]) { struct nft_obj_filter *filter; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); + filter = kzalloc(sizeof(*filter), GFP_ATOMIC); if (!filter) return ERR_PTR(-ENOMEM); if (nla[NFTA_OBJ_TABLE]) { - filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL); + filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC); if (!filter->table) { kfree(filter); return ERR_PTR(-ENOMEM); @@ -4773,6 +4925,7 @@ nft_obj_filter_alloc(const struct nlattr * const nla[]) return filter; } +/* called with rcu_read_lock held */ static int nf_tables_getobj(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const nla[], @@ -4792,6 +4945,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_obj, .done = nf_tables_dump_obj_done, + .module = THIS_MODULE, }; if (nla[NFTA_OBJ_TABLE] || @@ -4804,7 +4958,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, c.data = filter; } - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } if (!nla[NFTA_OBJ_NAME] || @@ -4824,7 +4978,7 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, return PTR_ERR(obj); } - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -4969,7 +5123,7 @@ struct nft_flowtable *nft_flowtable_lookup(const struct nft_table *table, { struct nft_flowtable *flowtable; - list_for_each_entry(flowtable, &table->flowtables, list) { + list_for_each_entry_rcu(flowtable, &table->flowtables, list) { if (!nla_strcmp(nla, flowtable->name) && nft_active_genmask(flowtable, genmask)) return flowtable; @@ -5430,13 +5584,13 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[]) { struct nft_flowtable_filter *filter; - filter = kzalloc(sizeof(*filter), GFP_KERNEL); + filter = kzalloc(sizeof(*filter), GFP_ATOMIC); if (!filter) return ERR_PTR(-ENOMEM); if (nla[NFTA_FLOWTABLE_TABLE]) { filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE], - GFP_KERNEL); + GFP_ATOMIC); if (!filter->table) { kfree(filter); return ERR_PTR(-ENOMEM); @@ -5445,6 +5599,7 @@ nft_flowtable_filter_alloc(const struct nlattr * const nla[]) return filter; } +/* called with rcu_read_lock held */ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -5463,6 +5618,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, struct netlink_dump_control c = { .dump = nf_tables_dump_flowtable, .done = nf_tables_dump_flowtable_done, + .module = THIS_MODULE, }; if (nla[NFTA_FLOWTABLE_TABLE]) { @@ -5474,7 +5630,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, c.data = filter; } - return netlink_dump_start(nlsk, skb, nlh, &c); + return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c); } if (!nla[NFTA_FLOWTABLE_NAME]) @@ -5490,7 +5646,7 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk, if (IS_ERR(flowtable)) return PTR_ERR(flowtable); - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb2) return -ENOMEM; @@ -5654,7 +5810,7 @@ static int nf_tables_getgen(struct net *net, struct sock *nlsk, struct sk_buff *skb2; int err; - skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + skb2 = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (skb2 == NULL) return -ENOMEM; @@ -5676,7 +5832,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_table_policy, }, [NFT_MSG_GETTABLE] = { - .call = nf_tables_gettable, + .call_rcu = nf_tables_gettable, .attr_count = NFTA_TABLE_MAX, .policy = nft_table_policy, }, @@ -5691,7 +5847,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_chain_policy, }, [NFT_MSG_GETCHAIN] = { - .call = nf_tables_getchain, + .call_rcu = nf_tables_getchain, .attr_count = NFTA_CHAIN_MAX, .policy = nft_chain_policy, }, @@ -5706,7 +5862,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_rule_policy, }, [NFT_MSG_GETRULE] = { - .call = nf_tables_getrule, + .call_rcu = nf_tables_getrule, .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, @@ -5721,7 +5877,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_policy, }, [NFT_MSG_GETSET] = { - .call = nf_tables_getset, + .call_rcu = nf_tables_getset, .attr_count = NFTA_SET_MAX, .policy = nft_set_policy, }, @@ -5736,7 +5892,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETSETELEM] = { - .call = nf_tables_getsetelem, + .call_rcu = nf_tables_getsetelem, .attr_count = NFTA_SET_ELEM_LIST_MAX, .policy = nft_set_elem_list_policy, }, @@ -5746,7 +5902,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_set_elem_list_policy, }, [NFT_MSG_GETGEN] = { - .call = nf_tables_getgen, + .call_rcu = nf_tables_getgen, }, [NFT_MSG_NEWOBJ] = { .call_batch = nf_tables_newobj, @@ -5754,7 +5910,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ] = { - .call = nf_tables_getobj, + .call_rcu = nf_tables_getobj, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, @@ -5764,7 +5920,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_obj_policy, }, [NFT_MSG_GETOBJ_RESET] = { - .call = nf_tables_getobj, + .call_rcu = nf_tables_getobj, .attr_count = NFTA_OBJ_MAX, .policy = nft_obj_policy, }, @@ -5774,7 +5930,7 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .policy = nft_flowtable_policy, }, [NFT_MSG_GETFLOWTABLE] = { - .call = nf_tables_getflowtable, + .call_rcu = nf_tables_getflowtable, .attr_count = NFTA_FLOWTABLE_MAX, .policy = nft_flowtable_policy, }, @@ -5785,6 +5941,27 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { }, }; +static int nf_tables_validate(struct net *net) +{ + struct nft_table *table; + + switch (net->nft.validate_state) { + case NFT_VALIDATE_SKIP: + break; + case NFT_VALIDATE_NEED: + nft_validate_state_update(net, NFT_VALIDATE_DO); + /* fall through */ + case NFT_VALIDATE_DO: + list_for_each_entry(table, &net->nft.tables, list) { + if (nft_table_validate(net, table) < 0) + return -EAGAIN; + } + break; + } + + return 0; +} + static void nft_chain_commit_update(struct nft_trans *trans) { struct nft_base_chain *basechain; @@ -5850,21 +6027,166 @@ static void nf_tables_commit_release(struct net *net) } } +static int nf_tables_commit_chain_prepare(struct net *net, struct nft_chain *chain) +{ + struct nft_rule *rule; + unsigned int alloc = 0; + int i; + + /* already handled or inactive chain? */ + if (chain->rules_next || !nft_is_active_next(net, chain)) + return 0; + + rule = list_entry(&chain->rules, struct nft_rule, list); + i = 0; + + list_for_each_entry_continue(rule, &chain->rules, list) { + if (nft_is_active_next(net, rule)) + alloc++; + } + + chain->rules_next = nf_tables_chain_alloc_rules(chain, alloc); + if (!chain->rules_next) + return -ENOMEM; + + list_for_each_entry_continue(rule, &chain->rules, list) { + if (nft_is_active_next(net, rule)) + chain->rules_next[i++] = rule; + } + + chain->rules_next[i] = NULL; + return 0; +} + +static void nf_tables_commit_chain_prepare_cancel(struct net *net) +{ + struct nft_trans *trans, *next; + + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + struct nft_chain *chain = trans->ctx.chain; + + if (trans->msg_type == NFT_MSG_NEWRULE || + trans->msg_type == NFT_MSG_DELRULE) { + kvfree(chain->rules_next); + chain->rules_next = NULL; + } + } +} + +static void __nf_tables_commit_chain_free_rules_old(struct rcu_head *h) +{ + struct nft_rules_old *o = container_of(h, struct nft_rules_old, h); + + kvfree(o->start); +} + +static void nf_tables_commit_chain_free_rules_old(struct nft_rule **rules) +{ + struct nft_rule **r = rules; + struct nft_rules_old *old; + + while (*r) + r++; + + r++; /* rcu_head is after end marker */ + old = (void *) r; + old->start = rules; + + call_rcu(&old->h, __nf_tables_commit_chain_free_rules_old); +} + +static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *chain) +{ + struct nft_rule **g0, **g1; + bool next_genbit; + + next_genbit = nft_gencursor_next(net); + + g0 = rcu_dereference_protected(chain->rules_gen_0, + lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + g1 = rcu_dereference_protected(chain->rules_gen_1, + lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES)); + + /* No changes to this chain? */ + if (chain->rules_next == NULL) { + /* chain had no change in last or next generation */ + if (g0 == g1) + return; + /* + * chain had no change in this generation; make sure next + * one uses same rules as current generation. + */ + if (next_genbit) { + rcu_assign_pointer(chain->rules_gen_1, g0); + nf_tables_commit_chain_free_rules_old(g1); + } else { + rcu_assign_pointer(chain->rules_gen_0, g1); + nf_tables_commit_chain_free_rules_old(g0); + } + + return; + } + + if (next_genbit) + rcu_assign_pointer(chain->rules_gen_1, chain->rules_next); + else + rcu_assign_pointer(chain->rules_gen_0, chain->rules_next); + + chain->rules_next = NULL; + + if (g0 == g1) + return; + + if (next_genbit) + nf_tables_commit_chain_free_rules_old(g1); + else + nf_tables_commit_chain_free_rules_old(g0); +} + static int nf_tables_commit(struct net *net, struct sk_buff *skb) { struct nft_trans *trans, *next; struct nft_trans_elem *te; + struct nft_chain *chain; + struct nft_table *table; - /* Bump generation counter, invalidate any dump in progress */ - while (++net->nft.base_seq == 0); + /* 0. Validate ruleset, otherwise roll back for error reporting. */ + if (nf_tables_validate(net) < 0) + return -EAGAIN; - /* A new generation has just started */ - net->nft.gencursor = nft_gencursor_next(net); + /* 1. Allocate space for next generation rules_gen_X[] */ + list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { + int ret; + + if (trans->msg_type == NFT_MSG_NEWRULE || + trans->msg_type == NFT_MSG_DELRULE) { + chain = trans->ctx.chain; - /* Make sure all packets have left the previous generation before - * purging old rules. + ret = nf_tables_commit_chain_prepare(net, chain); + if (ret < 0) { + nf_tables_commit_chain_prepare_cancel(net); + return ret; + } + } + } + + /* step 2. Make rules_gen_X visible to packet path */ + list_for_each_entry(table, &net->nft.tables, list) { + list_for_each_entry(chain, &table->chains, list) { + if (!nft_is_active_next(net, chain)) + continue; + nf_tables_commit_chain_active(net, chain); + } + } + + /* + * Bump generation counter, invalidate any dump in progress. + * Cannot fail after this point. */ - synchronize_rcu(); + while (++net->nft.base_seq == 0); + + /* step 3. Start new generation, rules_gen_X now in use. */ + net->nft.gencursor = nft_gencursor_next(net); list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) { switch (trans->msg_type) { @@ -6126,6 +6448,11 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb) return 0; } +static void nf_tables_cleanup(struct net *net) +{ + nft_validate_state_update(net, NFT_VALIDATE_SKIP); +} + static bool nf_tables_valid_genid(struct net *net, u32 genid) { return net->nft.base_seq == genid; @@ -6138,6 +6465,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = { .cb = nf_tables_cb, .commit = nf_tables_commit, .abort = nf_tables_abort, + .cleanup = nf_tables_cleanup, .valid_genid = nf_tables_valid_genid, }; @@ -6221,19 +6549,18 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, list_for_each_entry(rule, &chain->rules, list) { nft_rule_for_each_expr(expr, last, rule) { - const struct nft_data *data = NULL; + struct nft_immediate_expr *priv; + const struct nft_data *data; int err; - if (!expr->ops->validate) + if (strcmp(expr->ops->type->name, "immediate")) continue; - err = expr->ops->validate(ctx, expr, &data); - if (err < 0) - return err; - - if (data == NULL) + priv = nft_expr_priv(expr); + if (priv->dreg != NFT_REG_VERDICT) continue; + data = &priv->data; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: @@ -6713,6 +7040,8 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&net->nft.tables); INIT_LIST_HEAD(&net->nft.commit_list); net->nft.base_seq = 1; + net->nft.validate_state = NFT_VALIDATE_SKIP; + return 0; } diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 4f46d2f4e167..47cf667b15ca 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -23,22 +23,6 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> -static const char *const comments[__NFT_TRACETYPE_MAX] = { - [NFT_TRACETYPE_POLICY] = "policy", - [NFT_TRACETYPE_RETURN] = "return", - [NFT_TRACETYPE_RULE] = "rule", -}; - -static const struct nf_loginfo trace_loginfo = { - .type = NF_LOG_TYPE_LOG, - .u = { - .log = { - .level = LOGLEVEL_WARNING, - .logflags = NF_LOG_DEFAULT_MASK, - }, - }, -}; - static noinline void __nft_trace_packet(struct nft_traceinfo *info, const struct nft_chain *chain, enum nft_trace_types type) @@ -133,7 +117,7 @@ static noinline void nft_update_chain_stats(const struct nft_chain *chain, struct nft_jumpstack { const struct nft_chain *chain; - const struct nft_rule *rule; + struct nft_rule *const *rules; }; unsigned int @@ -141,27 +125,29 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv) { const struct nft_chain *chain = priv, *basechain = chain; const struct net *net = nft_net(pkt); + struct nft_rule *const *rules; const struct nft_rule *rule; const struct nft_expr *expr, *last; struct nft_regs regs; unsigned int stackptr = 0; struct nft_jumpstack jumpstack[NFT_JUMP_STACK_SIZE]; - unsigned int gencursor = nft_genmask_cur(net); + bool genbit = READ_ONCE(net->nft.gencursor); struct nft_traceinfo info; info.trace = false; if (static_branch_unlikely(&nft_trace_enabled)) nft_trace_init(&info, pkt, ®s.verdict, basechain); do_chain: - rule = list_entry(&chain->rules, struct nft_rule, list); + if (genbit) + rules = rcu_dereference(chain->rules_gen_1); + else + rules = rcu_dereference(chain->rules_gen_0); + next_rule: + rule = *rules; regs.verdict.code = NFT_CONTINUE; - list_for_each_entry_continue_rcu(rule, &chain->rules, list) { - - /* This rule is not active, skip. */ - if (unlikely(rule->genmask & gencursor)) - continue; - + for (; *rules ; rules++) { + rule = *rules; nft_rule_for_each_expr(expr, last, rule) { if (expr->ops == &nft_cmp_fast_ops) nft_cmp_fast_eval(expr, ®s); @@ -199,7 +185,7 @@ next_rule: case NFT_JUMP: BUG_ON(stackptr >= NFT_JUMP_STACK_SIZE); jumpstack[stackptr].chain = chain; - jumpstack[stackptr].rule = rule; + jumpstack[stackptr].rules = rules + 1; stackptr++; /* fall through */ case NFT_GOTO: @@ -221,7 +207,7 @@ next_rule: if (stackptr > 0) { stackptr--; chain = jumpstack[stackptr].chain; - rule = jumpstack[stackptr].rule; + rules = jumpstack[stackptr].rules; goto next_rule; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 03ead8a9e90c..4d0da7042aff 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -25,6 +25,7 @@ #include <linux/uaccess.h> #include <net/sock.h> #include <linux/init.h> +#include <linux/sched/signal.h> #include <net/netlink.h> #include <linux/netfilter/nfnetlink.h> @@ -37,6 +38,8 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER); rcu_dereference_protected(table[(id)].subsys, \ lockdep_nfnl_is_held((id))) +#define NFNL_MAX_ATTR_COUNT 32 + static struct { struct mutex mutex; const struct nfnetlink_subsystem __rcu *subsys; @@ -76,6 +79,13 @@ EXPORT_SYMBOL_GPL(lockdep_nfnl_is_held); int nfnetlink_subsys_register(const struct nfnetlink_subsystem *n) { + u8 cb_id; + + /* Sanity-check attr_count size to avoid stack buffer overflow. */ + for (cb_id = 0; cb_id < n->cb_count; cb_id++) + if (WARN_ON(n->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT)) + return -EINVAL; + nfnl_lock(n->subsys_id); if (table[n->subsys_id].subsys) { nfnl_unlock(n->subsys_id); @@ -185,11 +195,17 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); - struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; __u8 subsys_id = NFNL_SUBSYS_ID(type); + /* Sanity-check NFNL_MAX_ATTR_COUNT */ + if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { + rcu_read_unlock(); + return -ENOMEM; + } + err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy, extack); if (err < 0) { @@ -330,6 +346,13 @@ replay: while (skb->len >= nlmsg_total_size(0)) { int msglen, type; + if (fatal_signal_pending(current)) { + nfnl_err_reset(&err_list); + err = -EINTR; + status = NFNL_BATCH_FAILURE; + goto done; + } + memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; @@ -379,10 +402,16 @@ replay: { int min_len = nlmsg_total_size(sizeof(struct nfgenmsg)); u8 cb_id = NFNL_MSG_TYPE(nlh->nlmsg_type); - struct nlattr *cda[ss->cb[cb_id].attr_count + 1]; + struct nlattr *cda[NFNL_MAX_ATTR_COUNT + 1]; struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; + /* Sanity-check NFTA_MAX_ATTR */ + if (ss->cb[cb_id].attr_count > NFNL_MAX_ATTR_COUNT) { + err = -ENOMEM; + goto ack; + } + err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, ss->cb[cb_id].policy, NULL); if (err < 0) @@ -441,10 +470,19 @@ done: kfree_skb(skb); goto replay; } else if (status == NFNL_BATCH_DONE) { - ss->commit(net, oskb); + err = ss->commit(net, oskb); + if (err == -EAGAIN) { + status |= NFNL_BATCH_REPLAY; + goto done; + } else if (err) { + ss->abort(net, oskb); + netlink_ack(oskb, nlmsg_hdr(oskb), err, NULL); + } } else { ss->abort(net, oskb); } + if (ss->cleanup) + ss->cleanup(net); nfnl_err_deliver(&err_list, oskb); nfnl_unlock(subsys_id); diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 1d99a1efdafc..8d1ff654e5af 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -611,10 +611,10 @@ nla_put_failure: return -1; } -static int nfnl_compat_get(struct net *net, struct sock *nfnl, - struct sk_buff *skb, const struct nlmsghdr *nlh, - const struct nlattr * const tb[], - struct netlink_ext_ack *extack) +static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, + struct sk_buff *skb, const struct nlmsghdr *nlh, + const struct nlattr * const tb[], + struct netlink_ext_ack *extack) { int ret = 0, target; struct nfgenmsg *nfmsg; @@ -653,16 +653,21 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl, return -EINVAL; } + if (!try_module_get(THIS_MODULE)) + return -EINVAL; + + rcu_read_unlock(); try_then_request_module(xt_find_revision(nfmsg->nfgen_family, name, rev, target, &ret), fmt, name); - if (ret < 0) - return ret; + goto out_put; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) - return -ENOMEM; + if (skb2 == NULL) { + ret = -ENOMEM; + goto out_put; + } /* include the best revision for this extension in the message */ if (nfnl_compat_fill_info(skb2, NETLINK_CB(skb).portid, @@ -672,14 +677,16 @@ static int nfnl_compat_get(struct net *net, struct sock *nfnl, nfmsg->nfgen_family, name, ret, target) <= 0) { kfree_skb(skb2); - return -ENOSPC; + goto out_put; } ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); if (ret > 0) ret = 0; - +out_put: + rcu_read_lock(); + module_put(THIS_MODULE); return ret == -EAGAIN ? -ENOBUFS : ret; } @@ -691,7 +698,7 @@ static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = { }; static const struct nfnl_callback nfnl_nft_compat_cb[NFNL_MSG_COMPAT_MAX] = { - [NFNL_MSG_COMPAT_GET] = { .call = nfnl_compat_get, + [NFNL_MSG_COMPAT_GET] = { .call_rcu = nfnl_compat_get_rcu, .attr_count = NFTA_COMPAT_MAX, .policy = nfnl_compat_policy_get }, }; diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index ce13a50b9189..8abb9891cdf2 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -12,8 +12,12 @@ #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <linux/ip.h> +#include <linux/ipv6.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_dup_netdev.h> +#include <net/neighbour.h> +#include <net/ip.h> struct nft_fwd_netdev { enum nft_registers sreg_dev:8; @@ -32,6 +36,8 @@ static void nft_fwd_netdev_eval(const struct nft_expr *expr, static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, + [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 }, + [NFTA_FWD_NFPROTO] = { .type = NLA_U32 }, }; static int nft_fwd_netdev_init(const struct nft_ctx *ctx, @@ -62,7 +68,133 @@ nla_put_failure: return -1; } +struct nft_fwd_neigh { + enum nft_registers sreg_dev:8; + enum nft_registers sreg_addr:8; + u8 nfproto; +}; + +static void nft_fwd_neigh_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_fwd_neigh *priv = nft_expr_priv(expr); + void *addr = ®s->data[priv->sreg_addr]; + int oif = regs->data[priv->sreg_dev]; + unsigned int verdict = NF_STOLEN; + struct sk_buff *skb = pkt->skb; + struct net_device *dev; + int neigh_table; + + switch (priv->nfproto) { + case NFPROTO_IPV4: { + struct iphdr *iph; + + if (skb->protocol != htons(ETH_P_IP)) { + verdict = NFT_BREAK; + goto out; + } + if (skb_try_make_writable(skb, sizeof(*iph))) { + verdict = NF_DROP; + goto out; + } + iph = ip_hdr(skb); + ip_decrease_ttl(iph); + neigh_table = NEIGH_ARP_TABLE; + break; + } + case NFPROTO_IPV6: { + struct ipv6hdr *ip6h; + + if (skb->protocol != htons(ETH_P_IPV6)) { + verdict = NFT_BREAK; + goto out; + } + if (skb_try_make_writable(skb, sizeof(*ip6h))) { + verdict = NF_DROP; + goto out; + } + ip6h = ipv6_hdr(skb); + ip6h->hop_limit--; + neigh_table = NEIGH_ND_TABLE; + break; + } + default: + verdict = NFT_BREAK; + goto out; + } + + dev = dev_get_by_index_rcu(nft_net(pkt), oif); + if (dev == NULL) + return; + + skb->dev = dev; + neigh_xmit(neigh_table, dev, addr, skb); +out: + regs->verdict.code = verdict; +} + +static int nft_fwd_neigh_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_fwd_neigh *priv = nft_expr_priv(expr); + unsigned int addr_len; + int err; + + if (!tb[NFTA_FWD_SREG_DEV] || + !tb[NFTA_FWD_SREG_ADDR] || + !tb[NFTA_FWD_NFPROTO]) + return -EINVAL; + + priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]); + priv->sreg_addr = nft_parse_register(tb[NFTA_FWD_SREG_ADDR]); + priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO])); + + switch (priv->nfproto) { + case NFPROTO_IPV4: + addr_len = sizeof(struct in_addr); + break; + case NFPROTO_IPV6: + addr_len = sizeof(struct in6_addr); + break; + default: + return -EOPNOTSUPP; + } + + err = nft_validate_register_load(priv->sreg_dev, sizeof(int)); + if (err < 0) + return err; + + return nft_validate_register_load(priv->sreg_addr, addr_len); +} + +static const struct nft_expr_ops nft_fwd_netdev_ingress_ops; + +static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + struct nft_fwd_neigh *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev) || + nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr) || + nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto))) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + static struct nft_expr_type nft_fwd_netdev_type; +static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { + .type = &nft_fwd_netdev_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)), + .eval = nft_fwd_neigh_eval, + .init = nft_fwd_neigh_init, + .dump = nft_fwd_neigh_dump, +}; + static const struct nft_expr_ops nft_fwd_netdev_ops = { .type = &nft_fwd_netdev_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)), @@ -71,10 +203,22 @@ static const struct nft_expr_ops nft_fwd_netdev_ops = { .dump = nft_fwd_netdev_dump, }; +static const struct nft_expr_ops * +nft_fwd_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (tb[NFTA_FWD_SREG_ADDR]) + return &nft_fwd_neigh_netdev_ops; + if (tb[NFTA_FWD_SREG_DEV]) + return &nft_fwd_netdev_ops; + + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_fwd_netdev_type __read_mostly = { .family = NFPROTO_NETDEV, .name = "fwd", - .ops = &nft_fwd_netdev_ops, + .select_ops = nft_fwd_select_ops, .policy = nft_fwd_netdev_policy, .maxattr = NFTA_FWD_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index f0fc21f88775..c2d237144f74 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -177,10 +177,7 @@ static int nft_jhash_map_init(const struct nft_ctx *ctx, priv->map = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_HASH_SET_NAME], tb[NFTA_HASH_SET_ID], genmask); - if (IS_ERR(priv->map)) - return PTR_ERR(priv->map); - - return 0; + return PTR_ERR_OR_ZERO(priv->map); } static int nft_symhash_init(const struct nft_ctx *ctx, @@ -220,10 +217,7 @@ static int nft_symhash_map_init(const struct nft_ctx *ctx, priv->map = nft_set_lookup_global(ctx->net, ctx->table, tb[NFTA_HASH_SET_NAME], tb[NFTA_HASH_SET_ID], genmask); - if (IS_ERR(priv->map)) - return PTR_ERR(priv->map); - - return 0; + return PTR_ERR_OR_ZERO(priv->map); } static int nft_jhash_dump(struct sk_buff *skb, diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index aa87ff8beae8..15adf8ca82c3 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -17,12 +17,6 @@ #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> -struct nft_immediate_expr { - struct nft_data data; - enum nft_registers dreg:8; - u8 dlen; -}; - static void nft_immediate_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -101,12 +95,27 @@ nla_put_failure: static int nft_immediate_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, - const struct nft_data **data) + const struct nft_data **d) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); + const struct nft_data *data; + int err; - if (priv->dreg == NFT_REG_VERDICT) - *data = &priv->data; + if (priv->dreg != NFT_REG_VERDICT) + return 0; + + data = &priv->data; + + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + err = nft_chain_validate(ctx, data->verdict.chain); + if (err < 0) + return err; + break; + default: + break; + } return 0; } diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index a27be36dc0af..7eef1cffbf1b 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -9,12 +9,15 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ +#include <linux/audit.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> +#include <net/ipv6.h> +#include <net/ip.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netdevice.h> @@ -26,12 +29,93 @@ struct nft_log { char *prefix; }; +static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct iphdr _iph; + const struct iphdr *ih; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph); + if (!ih) + return false; + + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", + &ih->saddr, &ih->daddr, ih->protocol); + + return true; +} + +static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +{ + struct ipv6hdr _ip6h; + const struct ipv6hdr *ih; + u8 nexthdr; + __be16 frag_off; + + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); + if (!ih) + return false; + + nexthdr = ih->nexthdr; + ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off); + + audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", + &ih->saddr, &ih->daddr, nexthdr); + + return true; +} + +static void nft_log_eval_audit(const struct nft_pktinfo *pkt) +{ + struct sk_buff *skb = pkt->skb; + struct audit_buffer *ab; + int fam = -1; + + if (!audit_enabled) + return; + + ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); + if (!ab) + return; + + audit_log_format(ab, "mark=%#x", skb->mark); + + switch (nft_pf(pkt)) { + case NFPROTO_BRIDGE: + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; + break; + case htons(ETH_P_IPV6): + fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; + break; + } + break; + case NFPROTO_IPV4: + fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; + break; + case NFPROTO_IPV6: + fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; + break; + } + + if (fam == -1) + audit_log_format(ab, " saddr=? daddr=? proto=-1"); + + audit_log_end(ab); +} + static void nft_log_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_log *priv = nft_expr_priv(expr); + if (priv->loginfo.type == NF_LOG_TYPE_LOG && + priv->loginfo.u.log.level == LOGLEVEL_AUDIT) { + nft_log_eval_audit(pkt); + return; + } + nf_log_packet(nft_net(pkt), nft_pf(pkt), nft_hook(pkt), pkt->skb, nft_in(pkt), nft_out(pkt), &priv->loginfo, "%s", priv->prefix); @@ -84,7 +168,7 @@ static int nft_log_init(const struct nft_ctx *ctx, } else { li->u.log.level = LOGLEVEL_WARNING; } - if (li->u.log.level > LOGLEVEL_DEBUG) { + if (li->u.log.level > LOGLEVEL_AUDIT) { err = -EINVAL; goto err1; } @@ -112,6 +196,9 @@ static int nft_log_init(const struct nft_ctx *ctx, break; } + if (li->u.log.level == LOGLEVEL_AUDIT) + return 0; + err = nf_logger_find_get(ctx->family, li->type); if (err < 0) goto err1; @@ -133,6 +220,9 @@ static void nft_log_destroy(const struct nft_ctx *ctx, if (priv->prefix != nft_log_null_prefix) kfree(priv->prefix); + if (li->u.log.level == LOGLEVEL_AUDIT) + return; + nf_logger_put(ctx->family, li->type); } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index f52da5e2199f..42e6fadf1417 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -149,6 +149,52 @@ nla_put_failure: return -1; } +static int nft_lookup_validate_setelem(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) +{ + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + const struct nft_data *data; + + if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) && + *nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END) + return 0; + + data = nft_set_ext_data(ext); + switch (data->verdict.code) { + case NFT_JUMP: + case NFT_GOTO: + return nft_chain_validate(ctx, data->verdict.chain); + default: + return 0; + } +} + +static int nft_lookup_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **d) +{ + const struct nft_lookup *priv = nft_expr_priv(expr); + struct nft_set_iter iter; + + if (!(priv->set->flags & NFT_SET_MAP) || + priv->set->dtype != NFT_DATA_VERDICT) + return 0; + + iter.genmask = nft_genmask_next(ctx->net); + iter.skip = 0; + iter.count = 0; + iter.err = 0; + iter.fn = nft_lookup_validate_setelem; + + priv->set->ops->walk(ctx, priv->set, &iter); + if (iter.err < 0) + return iter.err; + + return 0; +} + static const struct nft_expr_ops nft_lookup_ops = { .type = &nft_lookup_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_lookup)), @@ -156,6 +202,7 @@ static const struct nft_expr_ops nft_lookup_ops = { .init = nft_lookup_init, .destroy = nft_lookup_destroy, .dump = nft_lookup_dump, + .validate = nft_lookup_validate, }; struct nft_expr_type nft_lookup_type __read_mostly = { diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index cdbc62a53933..1f4d0854cf70 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -114,10 +114,7 @@ static int nft_ng_inc_map_init(const struct nft_ctx *ctx, tb[NFTA_NG_SET_NAME], tb[NFTA_NG_SET_ID], genmask); - if (IS_ERR(priv->map)) - return PTR_ERR(priv->map); - - return 0; + return PTR_ERR_OR_ZERO(priv->map); } static int nft_ng_dump(struct sk_buff *skb, enum nft_registers dreg, diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c new file mode 100644 index 000000000000..d86337068ecb --- /dev/null +++ b/net/netfilter/nft_socket.c @@ -0,0 +1,143 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/module.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_socket.h> +#include <net/inet_sock.h> + +struct nft_socket { + enum nft_socket_keys key:8; + union { + enum nft_registers dreg:8; + }; +}; + +static void nft_socket_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_socket *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + struct sock *sk = skb->sk; + u32 *dest = ®s->data[priv->dreg]; + + if (!sk) + switch(nft_pf(pkt)) { + case NFPROTO_IPV4: + sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt)); + break; +#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) + case NFPROTO_IPV6: + sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt)); + break; +#endif + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + return; + } + + if(!sk) { + nft_reg_store8(dest, 0); + return; + } + + /* So that subsequent socket matching not to require other lookups. */ + skb->sk = sk; + + switch(priv->key) { + case NFT_SOCKET_TRANSPARENT: + nft_reg_store8(dest, nf_sk_is_transparent(sk)); + break; + default: + WARN_ON(1); + regs->verdict.code = NFT_BREAK; + } +} + +static const struct nla_policy nft_socket_policy[NFTA_SOCKET_MAX + 1] = { + [NFTA_SOCKET_KEY] = { .type = NLA_U32 }, + [NFTA_SOCKET_DREG] = { .type = NLA_U32 }, +}; + +static int nft_socket_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_socket *priv = nft_expr_priv(expr); + unsigned int len; + + if (!tb[NFTA_SOCKET_DREG] || !tb[NFTA_SOCKET_KEY]) + return -EINVAL; + + switch(ctx->family) { + case NFPROTO_IPV4: +#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6) + case NFPROTO_IPV6: +#endif + case NFPROTO_INET: + break; + default: + return -EOPNOTSUPP; + } + + priv->key = ntohl(nla_get_u32(tb[NFTA_SOCKET_KEY])); + switch(priv->key) { + case NFT_SOCKET_TRANSPARENT: + len = sizeof(u8); + break; + default: + return -EOPNOTSUPP; + } + + priv->dreg = nft_parse_register(tb[NFTA_SOCKET_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +static int nft_socket_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_socket *priv = nft_expr_priv(expr); + + if (nla_put_u32(skb, NFTA_SOCKET_KEY, htonl(priv->key))) + return -1; + if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) + return -1; + return 0; +} + +static struct nft_expr_type nft_socket_type; +static const struct nft_expr_ops nft_socket_ops = { + .type = &nft_socket_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_socket)), + .eval = nft_socket_eval, + .init = nft_socket_init, + .dump = nft_socket_dump, +}; + +static struct nft_expr_type nft_socket_type __read_mostly = { + .name = "socket", + .ops = &nft_socket_ops, + .policy = nft_socket_policy, + .maxattr = NFTA_SOCKET_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_socket_module_init(void) +{ + return nft_register_expr(&nft_socket_type); +} + +static void __exit nft_socket_module_exit(void) +{ + nft_unregister_expr(&nft_socket_type); +} + +module_init(nft_socket_module_init); +module_exit(nft_socket_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Máté Eckl"); +MODULE_DESCRIPTION("nf_tables socket match module"); |