From 719f8bcc883e7992615f4d5625922e24995e2d98 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 13 Aug 2012 17:03:00 -0400 Subject: svcrpc: fix xpt_list traversal locking on shutdown Server threads are not running at this point, but svc_age_temp_xprts still may be, so we need this locking. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index bac973a31367..e1810b947dea 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -917,16 +917,18 @@ void svc_close_xprt(struct svc_xprt *xprt) } EXPORT_SYMBOL_GPL(svc_close_xprt); -static void svc_close_list(struct list_head *xprt_list, struct net *net) +static void svc_close_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net) { struct svc_xprt *xprt; + spin_lock(&serv->sv_lock); list_for_each_entry(xprt, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; set_bit(XPT_CLOSE, &xprt->xpt_flags); set_bit(XPT_BUSY, &xprt->xpt_flags); } + spin_unlock(&serv->sv_lock); } static void svc_clear_pools(struct svc_serv *serv, struct net *net) @@ -949,24 +951,28 @@ static void svc_clear_pools(struct svc_serv *serv, struct net *net) } } -static void svc_clear_list(struct list_head *xprt_list, struct net *net) +static void svc_clear_list(struct svc_serv *serv, struct list_head *xprt_list, struct net *net) { struct svc_xprt *xprt; struct svc_xprt *tmp; + LIST_HEAD(victims); + spin_lock(&serv->sv_lock); list_for_each_entry_safe(xprt, tmp, xprt_list, xpt_list) { if (xprt->xpt_net != net) continue; - svc_delete_xprt(xprt); + list_move(&xprt->xpt_list, &victims); } - list_for_each_entry(xprt, xprt_list, xpt_list) - BUG_ON(xprt->xpt_net == net); + spin_unlock(&serv->sv_lock); + + list_for_each_entry_safe(xprt, tmp, &victims, xpt_list) + svc_delete_xprt(xprt); } void svc_close_net(struct svc_serv *serv, struct net *net) { - svc_close_list(&serv->sv_tempsocks, net); - svc_close_list(&serv->sv_permsocks, net); + svc_close_list(serv, &serv->sv_tempsocks, net); + svc_close_list(serv, &serv->sv_permsocks, net); svc_clear_pools(serv, net); /* @@ -974,8 +980,8 @@ void svc_close_net(struct svc_serv *serv, struct net *net) * svc_xprt_enqueue will not add new entries without taking the * sp_lock and checking XPT_BUSY. */ - svc_clear_list(&serv->sv_tempsocks, net); - svc_clear_list(&serv->sv_permsocks, net); + svc_clear_list(serv, &serv->sv_tempsocks, net); + svc_clear_list(serv, &serv->sv_permsocks, net); } /* -- cgit v1.2.3 From 72c3537607e42928f13691d59579ec840014b19e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 13 Aug 2012 17:46:17 -0400 Subject: svcrpc: standardize svc_setup_socket return convention Use the kernel-standard ptr-or-error return convention instead of passing a pointer to the error. Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 998aa8c1807c..d028b51a69ad 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -59,7 +59,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, - int *errp, int flags); + int flags); static void svc_udp_data_ready(struct sock *, int); static int svc_udp_recvfrom(struct svc_rqst *); static int svc_udp_sendto(struct svc_rqst *); @@ -900,8 +900,9 @@ static struct svc_xprt *svc_tcp_accept(struct svc_xprt *xprt) */ newsock->sk->sk_sndtimeo = HZ*30; - if (!(newsvsk = svc_setup_socket(serv, newsock, &err, - (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)))) + newsvsk = svc_setup_socket(serv, newsock, + (SVC_SOCK_ANONYMOUS | SVC_SOCK_TEMPORARY)); + if (IS_ERR(newsvsk)) goto failed; svc_xprt_set_remote(&newsvsk->sk_xprt, sin, slen); err = kernel_getsockname(newsock, sin, &slen); @@ -1383,29 +1384,29 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs); */ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct socket *sock, - int *errp, int flags) + int flags) { struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int err = 0; dprintk("svc: svc_setup_socket %p\n", sock); - if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { - *errp = -ENOMEM; - return NULL; - } + svsk = kzalloc(sizeof(*svsk), GFP_KERNEL); + if (!svsk) + return ERR_PTR(-ENOMEM); inet = sock->sk; /* Register socket with portmapper */ - if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, sock_net(sock->sk), inet->sk_family, + if (pmap_register) + err = svc_register(serv, sock_net(sock->sk), inet->sk_family, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); - if (*errp < 0) { + if (err < 0) { kfree(svsk); - return NULL; + return ERR_PTR(err); } inet->sk_user_data = svsk; @@ -1463,10 +1464,12 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, else { if (!try_module_get(THIS_MODULE)) err = -ENOENT; - else - svsk = svc_setup_socket(serv, so, &err, - SVC_SOCK_DEFAULTS); - if (svsk) { + else { + svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS); + if (IS_ERR(svsk)) + err = PTR_ERR(svsk); + } + if (err == 0) { struct sockaddr_storage addr; struct sockaddr *sin = (struct sockaddr *)&addr; int salen; @@ -1563,11 +1566,12 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, goto bummer; } - if ((svsk = svc_setup_socket(serv, sock, &error, flags)) != NULL) { + svsk = svc_setup_socket(serv, sock, flags); + if (!IS_ERR(svsk)) { svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); return (struct svc_xprt *)svsk; } - + error = PTR_ERR(svsk); bummer: dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); -- cgit v1.2.3 From a8e10078a87c8a2c3c8d0f9856c0f74272fc0f74 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 13 Aug 2012 18:01:03 -0400 Subject: svcrpc: clean up control flow Mainly, use the kernel standard err = -ERROR; if (something_bad) goto out; normal case; rather than if (something_bad) err = -ERROR else { normal case; } Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 69 ++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index d028b51a69ad..bf10b723f429 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1451,44 +1451,42 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, int err = 0; struct socket *so = sockfd_lookup(fd, &err); struct svc_sock *svsk = NULL; + struct sockaddr_storage addr; + struct sockaddr *sin = (struct sockaddr *)&addr; + int salen; if (!so) return err; + err = -EAFNOSUPPORT; if ((so->sk->sk_family != PF_INET) && (so->sk->sk_family != PF_INET6)) - err = -EAFNOSUPPORT; - else if (so->sk->sk_protocol != IPPROTO_TCP && + goto out; + err = -EPROTONOSUPPORT; + if (so->sk->sk_protocol != IPPROTO_TCP && so->sk->sk_protocol != IPPROTO_UDP) - err = -EPROTONOSUPPORT; - else if (so->state > SS_UNCONNECTED) - err = -EISCONN; - else { - if (!try_module_get(THIS_MODULE)) - err = -ENOENT; - else { - svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS); - if (IS_ERR(svsk)) - err = PTR_ERR(svsk); - } - if (err == 0) { - struct sockaddr_storage addr; - struct sockaddr *sin = (struct sockaddr *)&addr; - int salen; - if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) - svc_xprt_set_local(&svsk->sk_xprt, sin, salen); - clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); - spin_lock_bh(&serv->sv_lock); - list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); - spin_unlock_bh(&serv->sv_lock); - svc_xprt_received(&svsk->sk_xprt); - err = 0; - } else - module_put(THIS_MODULE); - } - if (err) { - sockfd_put(so); - return err; + goto out; + err = -EISCONN; + if (so->state > SS_UNCONNECTED) + goto out; + err = -ENOENT; + if (!try_module_get(THIS_MODULE)) + goto out; + svsk = svc_setup_socket(serv, so, SVC_SOCK_DEFAULTS); + if (IS_ERR(svsk)) { + module_put(THIS_MODULE); + err = PTR_ERR(svsk); + goto out; } + if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) + svc_xprt_set_local(&svsk->sk_xprt, sin, salen); + clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(&svsk->sk_xprt); return svc_one_sock_name(svsk, name_return, len); +out: + sockfd_put(so); + return err; } EXPORT_SYMBOL_GPL(svc_addsock); @@ -1567,11 +1565,12 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, } svsk = svc_setup_socket(serv, sock, flags); - if (!IS_ERR(svsk)) { - svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); - return (struct svc_xprt *)svsk; + if (IS_ERR(svsk)) { + error = PTR_ERR(svsk); + goto bummer; } - error = PTR_ERR(svsk); + svc_xprt_set_local(&svsk->sk_xprt, newsin, newlen); + return (struct svc_xprt *)svsk; bummer: dprintk("svc: svc_create_socket error = %d\n", -error); sock_release(sock); -- cgit v1.2.3 From c3341966943284ab3618a1814cefd693ad9aa736 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 14 Aug 2012 15:27:23 -0400 Subject: svcrpc: make svc_create_xprt enqueue on clearing XPT_BUSY Whenever we clear XPT_BUSY we should call svc_xprt_enqueue(). Without that we may fail to notice any events (such as new connections) that arrived while XPT_BUSY was set. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index e1810b947dea..4801fdac2c9d 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -238,7 +238,7 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, list_add(&newxprt->xpt_list, &serv->sv_permsocks); spin_unlock_bh(&serv->sv_lock); newport = svc_xprt_local_port(newxprt); - clear_bit(XPT_BUSY, &newxprt->xpt_flags); + svc_xprt_received(newxprt); return newport; } err: -- cgit v1.2.3 From 39b553013719fe6495cf5e496b827b2d712e4265 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 14 Aug 2012 15:50:34 -0400 Subject: svcrpc: share some setup of listening sockets There's some duplicate code here. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 16 ++++++++++------ net/sunrpc/svcsock.c | 6 +----- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 4801fdac2c9d..ee15663798b3 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -208,6 +208,15 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); } +void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) +{ + clear_bit(XPT_TEMP, &new->xpt_flags); + spin_lock_bh(&serv->sv_lock); + list_add(&new->xpt_list, &serv->sv_permsocks); + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(new); +} + int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, struct net *net, const int family, const unsigned short port, int flags) @@ -232,13 +241,8 @@ int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, module_put(xcl->xcl_owner); return PTR_ERR(newxprt); } - - clear_bit(XPT_TEMP, &newxprt->xpt_flags); - spin_lock_bh(&serv->sv_lock); - list_add(&newxprt->xpt_list, &serv->sv_permsocks); - spin_unlock_bh(&serv->sv_lock); + svc_add_new_perm_xprt(serv, newxprt); newport = svc_xprt_local_port(newxprt); - svc_xprt_received(newxprt); return newport; } err: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index bf10b723f429..c7a7b14f54ed 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1478,11 +1478,7 @@ int svc_addsock(struct svc_serv *serv, const int fd, char *name_return, } if (kernel_getsockname(svsk->sk_sock, sin, &salen) == 0) svc_xprt_set_local(&svsk->sk_xprt, sin, salen); - clear_bit(XPT_TEMP, &svsk->sk_xprt.xpt_flags); - spin_lock_bh(&serv->sv_lock); - list_add(&svsk->sk_xprt.xpt_list, &serv->sv_permsocks); - spin_unlock_bh(&serv->sv_lock); - svc_xprt_received(&svsk->sk_xprt); + svc_add_new_perm_xprt(serv, &svsk->sk_xprt); return svc_one_sock_name(svsk, name_return, len); out: sockfd_put(so); -- cgit v1.2.3 From f23abfdb94fda3108441530cb4a813088d3f9176 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 17 Aug 2012 20:32:27 -0400 Subject: svcrpc: minor udp code cleanup Order the code in a more boring way. Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c7a7b14f54ed..06ae8a755349 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -620,10 +620,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (!svc_udp_get_dest_address(rqstp, cmh)) { net_warn_ratelimited("svc: received unknown control message %d/%d; dropping RPC reply datagram\n", cmh->cmsg_level, cmh->cmsg_type); -out_free: - trace_kfree_skb(skb, svc_udp_recvfrom); - skb_free_datagram_locked(svsk->sk_sk, skb); - return 0; + goto out_free; } rqstp->rq_daddrlen = svc_addr_len(svc_daddr(rqstp)); @@ -662,6 +659,10 @@ out_free: serv->sv_stats->netudpcnt++; return len; +out_free: + trace_kfree_skb(skb, svc_udp_recvfrom); + skb_free_datagram_locked(svsk->sk_sk, skb); + return 0; } static int -- cgit v1.2.3 From af6d572134b012ca92c4efc8a2f1cadbe5d01064 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 21 Aug 2012 17:22:11 -0400 Subject: svcrpc: don't bother checking bad svc_addr_len result None of the callers should see an unsupported address family (only one of them even bothers to check for that case), so just check for the buggy case in svc_addr_len and don't bother elsewhere. Acked-by: Chuck Lever Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 06ae8a755349..406688baac57 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -601,8 +601,6 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) return -EAGAIN; } len = svc_addr_len(svc_addr(rqstp)); - if (len == 0) - return -EAFNOSUPPORT; rqstp->rq_addrlen = len; if (skb->tstamp.tv64 == 0) { skb->tstamp = ktime_get_real(); -- cgit v1.2.3 From 9f9d2ebe693a98d517257e1a39f61120b4473b96 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 17 Aug 2012 21:35:24 -0400 Subject: svcrpc: make xpo_recvfrom return only >=0 The only errors returned from xpo_recvfrom have been -EAGAIN and -EAFNOSUPPORT. The latter was removed by a previous patch. That leaves only -EAGAIN, which is treated just like 0 by the caller (svc_recv). So, just ditch -EAGAIN and return 0 instead. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 2 +- net/sunrpc/svcsock.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index ee15663798b3..3e317307e288 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -743,7 +743,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) svc_xprt_received(xprt); /* No data, incomplete (TCP) read, or accept() */ - if (len == 0 || len == -EAGAIN) + if (len <= 0) goto out; clear_bit(XPT_OLD, &xprt->xpt_flags); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 406688baac57..7aee54c3fe46 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -598,7 +598,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) dprintk("svc: recvfrom returned error %d\n", -err); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); } - return -EAGAIN; + return 0; } len = svc_addr_len(svc_addr(rqstp)); rqstp->rq_addrlen = len; @@ -1174,13 +1174,13 @@ error: if (len != -EAGAIN) goto err_other; dprintk("RPC: TCP recvfrom got EAGAIN\n"); - return -EAGAIN; + return 0; err_other: printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", svsk->sk_xprt.xpt_server->sv_name, -len); set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); err_noclose: - return -EAGAIN; /* record not complete */ + return 0; /* record not complete */ } /* -- cgit v1.2.3 From 6741019c829ecfa6f7a504fae1305dcf5d5cf057 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 17 Aug 2012 22:12:19 -0400 Subject: svcrpc: make svc_xprt_received static Note this isn't used outside svc_xprt.c. May as well move it so we don't need a declaration while we're here. Also remove an outdated comment. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 41 ++++++++++++++++---------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 4 ---- 2 files changed, 20 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 3e317307e288..295e6ed21ca0 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -208,6 +208,26 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, return xcl->xcl_ops->xpo_create(serv, net, sap, len, flags); } +/* + * svc_xprt_received conditionally queues the transport for processing + * by another thread. The caller must hold the XPT_BUSY bit and must + * not thereafter touch transport data. + * + * Note: XPT_DATA only gets cleared when a read-attempt finds no (or + * insufficient) data. + */ +static void svc_xprt_received(struct svc_xprt *xprt) +{ + BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); + /* As soon as we clear busy, the xprt could be closed and + * 'put', so we need a reference to call svc_xprt_enqueue with: + */ + svc_xprt_get(xprt); + clear_bit(XPT_BUSY, &xprt->xpt_flags); + svc_xprt_enqueue(xprt); + svc_xprt_put(xprt); +} + void svc_add_new_perm_xprt(struct svc_serv *serv, struct svc_xprt *new) { clear_bit(XPT_TEMP, &new->xpt_flags); @@ -398,27 +418,6 @@ static struct svc_xprt *svc_xprt_dequeue(struct svc_pool *pool) return xprt; } -/* - * svc_xprt_received conditionally queues the transport for processing - * by another thread. The caller must hold the XPT_BUSY bit and must - * not thereafter touch transport data. - * - * Note: XPT_DATA only gets cleared when a read-attempt finds no (or - * insufficient) data. - */ -void svc_xprt_received(struct svc_xprt *xprt) -{ - BUG_ON(!test_bit(XPT_BUSY, &xprt->xpt_flags)); - /* As soon as we clear busy, the xprt could be closed and - * 'put', so we need a reference to call svc_xprt_enqueue with: - */ - svc_xprt_get(xprt); - clear_bit(XPT_BUSY, &xprt->xpt_flags); - svc_xprt_enqueue(xprt); - svc_xprt_put(xprt); -} -EXPORT_SYMBOL_GPL(svc_xprt_received); - /** * svc_reserve - change the space reserved for the reply to a request. * @rqstp: The request in question diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 73b428bef598..62e4f9bcc387 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -578,10 +578,6 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird) list_add_tail(&newxprt->sc_accept_q, &listen_xprt->sc_accept_q); spin_unlock_bh(&listen_xprt->sc_lock); - /* - * Can't use svc_xprt_received here because we are not on a - * rqstp thread - */ set_bit(XPT_CONN, &listen_xprt->sc_xprt.xpt_flags); svc_xprt_enqueue(&listen_xprt->sc_xprt); } -- cgit v1.2.3 From 6797fa5a018ff916a071c6265fbf043644abcd29 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 18 Aug 2012 15:33:51 -0400 Subject: svcrpc: break up svc_recv Matter of taste, I suppose, but svc_recv breaks up naturally into: allocate pages and setup arg dequeue (wait for, if necessary) next socket do something with that socket And I find it easier to read when it doesn't go on for pages and pages. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 103 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 67 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 295e6ed21ca0..6ebc9a95bbab 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -568,33 +568,12 @@ static void svc_check_conn_limits(struct svc_serv *serv) } } -/* - * Receive the next request on any transport. This code is carefully - * organised not to touch any cachelines in the shared svc_serv - * structure, only cachelines in the local svc_pool. - */ -int svc_recv(struct svc_rqst *rqstp, long timeout) +int svc_alloc_arg(struct svc_rqst *rqstp) { - struct svc_xprt *xprt = NULL; - struct svc_serv *serv = rqstp->rq_server; - struct svc_pool *pool = rqstp->rq_pool; - int len, i; - int pages; - struct xdr_buf *arg; - DECLARE_WAITQUEUE(wait, current); - long time_left; - - dprintk("svc: server %p waiting for data (to = %ld)\n", - rqstp, timeout); - - if (rqstp->rq_xprt) - printk(KERN_ERR - "svc_recv: service %p, transport not NULL!\n", - rqstp); - if (waitqueue_active(&rqstp->rq_wait)) - printk(KERN_ERR - "svc_recv: service %p, wait queue active!\n", - rqstp); + struct svc_serv *serv = rqstp->rq_server; + struct xdr_buf *arg; + int pages; + int i; /* now allocate needed pages. If we get a failure, sleep briefly */ pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE; @@ -624,11 +603,15 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) arg->page_len = (pages-2)*PAGE_SIZE; arg->len = (pages-1)*PAGE_SIZE; arg->tail[0].iov_len = 0; + return 0; +} - try_to_freeze(); - cond_resched(); - if (signalled() || kthread_should_stop()) - return -EINTR; +struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt; + struct svc_pool *pool = rqstp->rq_pool; + DECLARE_WAITQUEUE(wait, current); + long time_left; /* Normally we will wait up to 5 seconds for any required * cache information to be provided. @@ -666,7 +649,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (kthread_should_stop()) { set_current_state(TASK_RUNNING); spin_unlock_bh(&pool->sp_lock); - return -EINTR; + return ERR_PTR(-EINTR); } add_wait_queue(&rqstp->rq_wait, &wait); @@ -687,19 +670,25 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) spin_unlock_bh(&pool->sp_lock); dprintk("svc: server %p, no data yet\n", rqstp); if (signalled() || kthread_should_stop()) - return -EINTR; + return ERR_PTR(-EINTR); else - return -EAGAIN; + return ERR_PTR(-EAGAIN); } } spin_unlock_bh(&pool->sp_lock); + return xprt; +} + +static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) +{ + struct svc_serv *serv = rqstp->rq_server; + int len = 0; - len = 0; if (test_bit(XPT_CLOSE, &xprt->xpt_flags)) { dprintk("svc_recv: found XPT_CLOSE\n"); svc_delete_xprt(xprt); /* Leave XPT_BUSY set on the dead xprt: */ - goto out; + return 0; } if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { struct svc_xprt *newxpt; @@ -727,8 +716,9 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) svc_xprt_received(newxpt); } } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { + /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", - rqstp, pool->sp_id, xprt, + rqstp, rqstp->rq_pool->sp_id, xprt, atomic_read(&xprt->xpt_ref.refcount)); rqstp->rq_deferred = svc_deferred_dequeue(xprt); if (rqstp->rq_deferred) @@ -739,7 +729,48 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); } + /* clear XPT_BUSY: */ svc_xprt_received(xprt); + return len; +} + +/* + * Receive the next request on any transport. This code is carefully + * organised not to touch any cachelines in the shared svc_serv + * structure, only cachelines in the local svc_pool. + */ +int svc_recv(struct svc_rqst *rqstp, long timeout) +{ + struct svc_xprt *xprt = NULL; + struct svc_serv *serv = rqstp->rq_server; + int len, err; + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_xprt) + printk(KERN_ERR + "svc_recv: service %p, transport not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + err = svc_alloc_arg(rqstp); + if (err) + return err; + + try_to_freeze(); + cond_resched(); + if (signalled() || kthread_should_stop()) + return -EINTR; + + xprt = svc_get_next_xprt(rqstp, timeout); + if (IS_ERR(xprt)) + return PTR_ERR(xprt); + + len = svc_handle_xprt(rqstp, xprt); /* No data, incomplete (TCP) read, or accept() */ if (len <= 0) -- cgit v1.2.3 From 65b2e6656bda2ad983727fcc725ac66b6d5035a7 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Sat, 18 Aug 2012 15:44:33 -0400 Subject: svcrpc: split up svc_handle_xprt Move initialization of newly accepted socket into a helper. Signed-off-by: J. Bruce Fields --- net/sunrpc/svc_xprt.c | 47 +++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6ebc9a95bbab..194d865fae72 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -679,6 +679,23 @@ struct svc_xprt *svc_get_next_xprt(struct svc_rqst *rqstp, long timeout) return xprt; } +void svc_add_new_temp_xprt(struct svc_serv *serv, struct svc_xprt *newxpt) +{ + spin_lock_bh(&serv->sv_lock); + set_bit(XPT_TEMP, &newxpt->xpt_flags); + list_add(&newxpt->xpt_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + if (serv->sv_temptimer.function == NULL) { + /* setup timer to age temp transports */ + setup_timer(&serv->sv_temptimer, svc_age_temp_xprts, + (unsigned long)serv); + mod_timer(&serv->sv_temptimer, + jiffies + svc_conn_age_period * HZ); + } + spin_unlock_bh(&serv->sv_lock); + svc_xprt_received(newxpt); +} + static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) { struct svc_serv *serv = rqstp->rq_server; @@ -692,29 +709,15 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) } if (test_bit(XPT_LISTENER, &xprt->xpt_flags)) { struct svc_xprt *newxpt; + /* + * We know this module_get will succeed because the + * listener holds a reference too + */ + __module_get(xprt->xpt_class->xcl_owner); + svc_check_conn_limits(xprt->xpt_server); newxpt = xprt->xpt_ops->xpo_accept(xprt); - if (newxpt) { - /* - * We know this module_get will succeed because the - * listener holds a reference too - */ - __module_get(newxpt->xpt_class->xcl_owner); - svc_check_conn_limits(xprt->xpt_server); - spin_lock_bh(&serv->sv_lock); - set_bit(XPT_TEMP, &newxpt->xpt_flags); - list_add(&newxpt->xpt_list, &serv->sv_tempsocks); - serv->sv_tmpcnt++; - if (serv->sv_temptimer.function == NULL) { - /* setup timer to age temp transports */ - setup_timer(&serv->sv_temptimer, - svc_age_temp_xprts, - (unsigned long)serv); - mod_timer(&serv->sv_temptimer, - jiffies + svc_conn_age_period * HZ); - } - spin_unlock_bh(&serv->sv_lock); - svc_xprt_received(newxpt); - } + if (newxpt) + svc_add_new_temp_xprt(serv, newxpt); } else if (xprt->xpt_ops->xpo_has_wspace(xprt)) { /* XPT_DATA|XPT_DEFERRED case: */ dprintk("svc: server %p, pool %u, transport %p, inuse=%d\n", -- cgit v1.2.3 From 43def35c1030d91a7414936c7c1b416828b20afb Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Fri, 10 Aug 2012 15:52:06 +0200 Subject: net/9p: Check errno validity While working on a modified server I had the Linux clients crash a few times. This lead me to find this: Some error codes are directly extracted from the server replies. A malformed server reply could contain an invalid error code, with a very large value. If this value is then passed to ERR_PTR() it will not be properly detected as an error code by IS_ERR() and as a result the kernel will dereference an invalid pointer. This patch tries to avoid this. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 8260f132b32e..34d417670935 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -76,6 +76,20 @@ inline int p9_is_proto_dotu(struct p9_client *clnt) } EXPORT_SYMBOL(p9_is_proto_dotu); +/* + * Some error codes are taken directly from the server replies, + * make sure they are valid. + */ +static int safe_errno(int err) +{ + if ((err > 0) || (err < -MAX_ERRNO)) { + p9_debug(P9_DEBUG_ERROR, "Invalid error code %d\n", err); + return -EPROTO; + } + return err; +} + + /* Interpret mount option for protocol version */ static int get_protocol_version(char *s) { @@ -782,7 +796,7 @@ again: return req; reterr: p9_free_req(c, req); - return ERR_PTR(err); + return ERR_PTR(safe_errno(err)); } /** @@ -865,7 +879,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, return req; reterr: p9_free_req(c, req); - return ERR_PTR(err); + return ERR_PTR(safe_errno(err)); } static struct p9_fid *p9_fid_create(struct p9_client *clnt) -- cgit v1.2.3 From eccf50c129686de11358093839749c83f6cae5db Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 15 Aug 2012 18:07:43 -0400 Subject: nfsd: remove unused listener-removal interfaces You can use nfsd/portlist to give nfsd additional sockets to listen on. In theory you can also remove listening sockets this way. But nobody's ever done that as far as I can tell. Also this was partially broken in 2.6.25, by a217813f9067b785241cb7f31956e51d2071703a "knfsd: Support adding transports by writing portlist file". (Note that we decide whether to take the "delfd" case by checking for a digit--but what's actually expected in that case is something made by svc_one_sock_name(), which won't begin with a digit.) So, let's just rip out this stuff. Acked-by: NeilBrown Signed-off-by: J. Bruce Fields --- net/sunrpc/svcsock.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) (limited to 'net') diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 7aee54c3fe46..03827cef1fa7 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -305,57 +305,6 @@ static int svc_one_sock_name(struct svc_sock *svsk, char *buf, int remaining) return len; } -/** - * svc_sock_names - construct a list of listener names in a string - * @serv: pointer to RPC service - * @buf: pointer to a buffer to fill in with socket names - * @buflen: size of the buffer to be filled - * @toclose: pointer to '\0'-terminated C string containing the name - * of a listener to be closed - * - * Fills in @buf with a '\n'-separated list of names of listener - * sockets. If @toclose is not NULL, the socket named by @toclose - * is closed, and is not included in the output list. - * - * Returns positive length of the socket name string, or a negative - * errno value on error. - */ -int svc_sock_names(struct svc_serv *serv, char *buf, const size_t buflen, - const char *toclose) -{ - struct svc_sock *svsk, *closesk = NULL; - int len = 0; - - if (!serv) - return 0; - - spin_lock_bh(&serv->sv_lock); - list_for_each_entry(svsk, &serv->sv_permsocks, sk_xprt.xpt_list) { - int onelen = svc_one_sock_name(svsk, buf + len, buflen - len); - if (onelen < 0) { - len = onelen; - break; - } - if (toclose && strcmp(toclose, buf + len) == 0) { - closesk = svsk; - svc_xprt_get(&closesk->sk_xprt); - } else - len += onelen; - } - spin_unlock_bh(&serv->sv_lock); - - if (closesk) { - /* Should unregister with portmap, but you cannot - * unregister just one protocol... - */ - svc_close_xprt(&closesk->sk_xprt); - svc_xprt_put(&closesk->sk_xprt); - } else if (toclose) - return -ENOENT; - return len; -} -EXPORT_SYMBOL_GPL(svc_sock_names); - /* * Check input queue length */ -- cgit v1.2.3 From 0462194d358c2e040282d4d1a4fd1aab84417e42 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Mon, 17 Sep 2012 15:16:28 +0200 Subject: 9P: Fix race in p9_read_work() Race scenario between p9_read_work() and p9_poll_mux() Data arrive, Rworksched is set, p9_read_work() is called. thread A thread B p9_read_work() . reads data . checks if new data ready. No. . gets preempted . More data arrive, p9_poll_mux() is called. . . . p9_poll_mux() . . if (!test_and_set_bit(Rworksched, . &m->wsched)) { . schedule_work(&m->rq); . } . . -> does not schedule work because . Rworksched is set . . clear_bit(Rworksched, &m->wsched); return; No work has been scheduled, and yet data are waiting. Currently p9_read_work() checks if there is data to read, and if not, it clears Rworksched. I think it should clear Rworksched first, and then check if there is data to read. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_fd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 6449bae15702..de1bbad0c7de 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -316,8 +316,7 @@ static void p9_read_work(struct work_struct *work) m->rsize - m->rpos); p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); if (err == -EAGAIN) { - clear_bit(Rworksched, &m->wsched); - return; + goto end_clear; } if (err <= 0) @@ -379,19 +378,20 @@ static void p9_read_work(struct work_struct *work) m->req = NULL; } +end_clear: + clear_bit(Rworksched, &m->wsched); + if (!list_empty(&m->req_list)) { if (test_and_clear_bit(Rpending, &m->wsched)) n = POLLIN; else n = p9_fd_poll(m->client, NULL); - if (n & POLLIN) { + if ((n & POLLIN) && !test_and_set_bit(Rworksched, &m->wsched)) { p9_debug(P9_DEBUG_TRANS, "sched read work %p\n", m); schedule_work(&m->rq); - } else - clear_bit(Rworksched, &m->wsched); - } else - clear_bit(Rworksched, &m->wsched); + } + } return; error: -- cgit v1.2.3 From 1957b3a86f8eb5ceab32e3aae99e2822258aa530 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Mon, 17 Sep 2012 15:16:29 +0200 Subject: 9P: fix test at the end of p9_write_work() At the end of p9_write_work() we want to test if there is still data to send. This means: - either the current request still has data to send (wsize != 0) - or there are requests in the unsent queue Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_fd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index de1bbad0c7de..7088a94b2601 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -492,7 +492,7 @@ static void p9_write_work(struct work_struct *work) if (m->wpos == m->wsize) m->wpos = m->wsize = 0; - if (m->wsize == 0 && !list_empty(&m->unsent_req_list)) { + if (m->wsize || !list_empty(&m->unsent_req_list)) { if (test_and_clear_bit(Wpending, &m->wsched)) n = POLLOUT; else -- cgit v1.2.3 From 584a8c13d58423462680907d4cc40d9929c9030a Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Mon, 17 Sep 2012 15:16:30 +0200 Subject: 9P: Fix race in p9_write_work() See previous commit about p9_read_work() for details. This fixes a similar race between p9_write_work() and p9_poll_mux() Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_fd.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 7088a94b2601..b2c308fffb8a 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -476,10 +476,9 @@ static void p9_write_work(struct work_struct *work) clear_bit(Wpending, &m->wsched); err = p9_fd_write(m->client, m->wbuf + m->wpos, m->wsize - m->wpos); p9_debug(P9_DEBUG_TRANS, "mux %p sent %d bytes\n", m, err); - if (err == -EAGAIN) { - clear_bit(Wworksched, &m->wsched); - return; - } + if (err == -EAGAIN) + goto end_clear; + if (err < 0) goto error; @@ -492,19 +491,21 @@ static void p9_write_work(struct work_struct *work) if (m->wpos == m->wsize) m->wpos = m->wsize = 0; +end_clear: + clear_bit(Wworksched, &m->wsched); + if (m->wsize || !list_empty(&m->unsent_req_list)) { if (test_and_clear_bit(Wpending, &m->wsched)) n = POLLOUT; else n = p9_fd_poll(m->client, NULL); - if (n & POLLOUT) { + if ((n & POLLOUT) && + !test_and_set_bit(Wworksched, &m->wsched)) { p9_debug(P9_DEBUG_TRANS, "sched write work %p\n", m); schedule_work(&m->wq); - } else - clear_bit(Wworksched, &m->wsched); - } else - clear_bit(Wworksched, &m->wsched); + } + } return; -- cgit v1.2.3 From a519fc7a70d1a918574bb826cc6905b87b482eb9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Sep 2012 16:49:15 -0400 Subject: SUNRPC: Ensure that the TCP socket is closed when in CLOSE_WAIT Instead of doing a shutdown() call, we need to do an actual close(). Ditto if/when the server is sending us junk RPC headers. Signed-off-by: Trond Myklebust Tested-by: Simon Kirby Cc: stable@vger.kernel.org --- net/sunrpc/xprtsock.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index a35b8e52e551..d1988cf8bf33 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1025,6 +1025,16 @@ static void xs_udp_data_ready(struct sock *sk, int len) read_unlock_bh(&sk->sk_callback_lock); } +/* + * Helper function to force a TCP close if the server is sending + * junk and/or it has put us in CLOSE_WAIT + */ +static void xs_tcp_force_close(struct rpc_xprt *xprt) +{ + set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); + xprt_force_disconnect(xprt); +} + static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1051,7 +1061,7 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_rea /* Sanity check of the record length */ if (unlikely(transport->tcp_reclen < 8)) { dprintk("RPC: invalid TCP record fragment length\n"); - xprt_force_disconnect(xprt); + xs_tcp_force_close(xprt); return; } dprintk("RPC: reading TCP record fragment of length %d\n", @@ -1132,7 +1142,7 @@ static inline void xs_tcp_read_calldir(struct sock_xprt *transport, break; default: dprintk("RPC: invalid request message type\n"); - xprt_force_disconnect(&transport->xprt); + xs_tcp_force_close(&transport->xprt); } xs_tcp_check_fraghdr(transport); } @@ -1455,6 +1465,8 @@ static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt) static void xs_sock_mark_closed(struct rpc_xprt *xprt) { smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); + clear_bit(XPRT_CONNECTION_CLOSE, &xprt->state); clear_bit(XPRT_CLOSE_WAIT, &xprt->state); clear_bit(XPRT_CLOSING, &xprt->state); smp_mb__after_clear_bit(); @@ -1512,8 +1524,8 @@ static void xs_tcp_state_change(struct sock *sk) break; case TCP_CLOSE_WAIT: /* The server initiated a shutdown of the socket */ - xprt_force_disconnect(xprt); xprt->connect_cookie++; + xs_tcp_force_close(xprt); case TCP_CLOSING: /* * If the server closed down the connection, make sure that @@ -2199,8 +2211,7 @@ static void xs_tcp_setup_socket(struct work_struct *work) /* We're probably in TIME_WAIT. Get rid of existing socket, * and retry */ - set_bit(XPRT_CONNECTION_CLOSE, &xprt->state); - xprt_force_disconnect(xprt); + xs_tcp_force_close(xprt); break; case -ECONNREFUSED: case -ECONNRESET: -- cgit v1.2.3 From 84e28a307e376f271505af65a7b7e212dd6f61f4 Mon Sep 17 00:00:00 2001 From: Bryan Schumaker Date: Mon, 24 Sep 2012 13:39:01 -0400 Subject: SUNRPC: Set alloc_slot for backchannel tcp ops f39c1bfb5a03e2d255451bff05be0d7255298fa4 (SUNRPC: Fix a UDP transport regression) introduced the "alloc_slot" function for xprt operations, but never created one for the backchannel operations. This patch fixes a null pointer dereference when mounting NFS over v4.1. Call Trace: [] ? xprt_reserve+0x47/0x50 [sunrpc] [] call_reserve+0x34/0x60 [sunrpc] [] __rpc_execute+0x90/0x400 [sunrpc] [] rpc_async_schedule+0x2a/0x40 [sunrpc] [] process_one_work+0x139/0x500 [] ? alloc_worker+0x70/0x70 [] ? __rpc_execute+0x400/0x400 [sunrpc] [] worker_thread+0x15e/0x460 [] ? preempt_schedule+0x49/0x70 [] ? rescuer_thread+0x230/0x230 [] kthread+0x93/0xa0 [] kernel_thread_helper+0x4/0x10 [] ? kthread_freezable_should_stop+0x70/0x70 [] ? gs_change+0x13/0x13 Signed-off-by: Bryan Schumaker Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d1988cf8bf33..97f8918169ed 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2539,6 +2539,7 @@ static struct rpc_xprt_ops xs_tcp_ops = { static struct rpc_xprt_ops bc_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, .release_xprt = xprt_release_xprt, + .alloc_slot = xprt_alloc_slot, .rpcbind = xs_local_rpcbind, .buf_alloc = bc_malloc, .buf_free = bc_free, -- cgit v1.2.3 From 8a9a8b8332b92b13316cf49685b5dc5257cfe115 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 1 Aug 2012 14:32:13 -0400 Subject: SUNRPC: Fix the return value of xdr_align_pages() The callers of xdr_align_pages() expect it to return the number of bytes of actual XDR data remaining in the pages. Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 0afba1b4b656..fbbd1c475b43 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -742,6 +742,8 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) /* Truncate page data and move it into the tail */ if (buf->page_len > len) xdr_shrink_pagelen(buf, buf->page_len - len); + else + len = buf->page_len; xdr->nwords = XDR_QUADLEN(buf->len - cur); return len; } -- cgit v1.2.3 From 28407630513b1a86133db0ef8b39fabad6c494af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 17 Aug 2012 23:54:15 -0400 Subject: take descriptor handling from sock_alloc_file() to callers Signed-off-by: Al Viro --- net/socket.c | 62 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 40 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index edc3c4af9085..a14ec19164b6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -346,22 +346,15 @@ static struct file_system_type sock_fs_type = { * but we take care of internal coherence yet. */ -static int sock_alloc_file(struct socket *sock, struct file **f, int flags) +static struct file *sock_alloc_file(struct socket *sock, int flags) { struct qstr name = { .name = "" }; struct path path; struct file *file; - int fd; - - fd = get_unused_fd_flags(flags); - if (unlikely(fd < 0)) - return fd; path.dentry = d_alloc_pseudo(sock_mnt->mnt_sb, &name); - if (unlikely(!path.dentry)) { - put_unused_fd(fd); - return -ENOMEM; - } + if (unlikely(!path.dentry)) + return ERR_PTR(-ENOMEM); path.mnt = mntget(sock_mnt); d_instantiate(path.dentry, SOCK_INODE(sock)); @@ -373,28 +366,31 @@ static int sock_alloc_file(struct socket *sock, struct file **f, int flags) /* drop dentry, keep inode */ ihold(path.dentry->d_inode); path_put(&path); - put_unused_fd(fd); - return -ENFILE; + return ERR_PTR(-ENFILE); } sock->file = file; file->f_flags = O_RDWR | (flags & O_NONBLOCK); file->f_pos = 0; file->private_data = sock; - - *f = file; - return fd; + return file; } int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; - int fd = sock_alloc_file(sock, &newfile, flags); + int fd = get_unused_fd_flags(flags); + if (unlikely(fd < 0)) + return fd; - if (likely(fd >= 0)) + newfile = sock_alloc_file(sock, flags); + if (likely(!IS_ERR(newfile))) { fd_install(fd, newfile); + return fd; + } - return fd; + put_unused_fd(fd); + return PTR_ERR(newfile); } EXPORT_SYMBOL(sock_map_fd); @@ -1394,17 +1390,32 @@ SYSCALL_DEFINE4(socketpair, int, family, int, type, int, protocol, if (err < 0) goto out_release_both; - fd1 = sock_alloc_file(sock1, &newfile1, flags); + fd1 = get_unused_fd_flags(flags); if (unlikely(fd1 < 0)) { err = fd1; goto out_release_both; } - - fd2 = sock_alloc_file(sock2, &newfile2, flags); + fd2 = get_unused_fd_flags(flags); if (unlikely(fd2 < 0)) { err = fd2; + put_unused_fd(fd1); + goto out_release_both; + } + + newfile1 = sock_alloc_file(sock1, flags); + if (unlikely(IS_ERR(newfile1))) { + err = PTR_ERR(newfile1); + put_unused_fd(fd1); + put_unused_fd(fd2); + goto out_release_both; + } + + newfile2 = sock_alloc_file(sock2, flags); + if (IS_ERR(newfile2)) { + err = PTR_ERR(newfile2); fput(newfile1); put_unused_fd(fd1); + put_unused_fd(fd2); sock_release(sock2); goto out; } @@ -1536,12 +1547,19 @@ SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr, */ __module_get(newsock->ops->owner); - newfd = sock_alloc_file(newsock, &newfile, flags); + newfd = get_unused_fd_flags(flags); if (unlikely(newfd < 0)) { err = newfd; sock_release(newsock); goto out_put; } + newfile = sock_alloc_file(newsock, flags); + if (unlikely(IS_ERR(newfile))) { + err = PTR_ERR(newfile); + put_unused_fd(newfd); + sock_release(newsock); + goto out_put; + } err = security_socket_accept(sock, newsock); if (err) -- cgit v1.2.3 From 56b31d1c9f1e6a3ad92e7bfe252721e05d92b285 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Aug 2012 00:25:51 -0400 Subject: unexport sock_map_fd(), switch to sock_alloc_file() Both modular callers of sock_map_fd() had been buggy; sctp one leaks descriptor and file if copy_to_user() fails, 9p one shouldn't be exposing file in the descriptor table at all. Switch both to sock_alloc_file(), export it, unexport sock_map_fd() and make it static. Signed-off-by: Al Viro --- net/9p/trans_fd.c | 16 +++++++--------- net/sctp/socket.c | 25 ++++++++++++++++++++----- net/socket.c | 6 +++--- 3 files changed, 30 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 6449bae15702..8c4e0b538a8a 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -793,30 +793,28 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd) static int p9_socket_open(struct p9_client *client, struct socket *csocket) { struct p9_trans_fd *p; - int ret, fd; + struct file *file; + int ret; p = kmalloc(sizeof(struct p9_trans_fd), GFP_KERNEL); if (!p) return -ENOMEM; csocket->sk->sk_allocation = GFP_NOIO; - fd = sock_map_fd(csocket, 0); - if (fd < 0) { + file = sock_alloc_file(csocket, 0); + if (IS_ERR(file)) { pr_err("%s (%d): failed to map fd\n", __func__, task_pid_nr(current)); sock_release(csocket); kfree(p); - return fd; + return PTR_ERR(file); } - get_file(csocket->file); - get_file(csocket->file); - p->wr = p->rd = csocket->file; + get_file(file); + p->wr = p->rd = file; client->trans = p; client->status = Connected; - sys_close(fd); /* still racy */ - p->rd->f_flags |= O_NONBLOCK; p->conn = p9_conn_create(client); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 5e259817a7f3..fb5931ca50d0 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4276,6 +4277,7 @@ static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval { sctp_peeloff_arg_t peeloff; struct socket *newsock; + struct file *newfile; int retval = 0; if (len < sizeof(sctp_peeloff_arg_t)) @@ -4289,22 +4291,35 @@ static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval goto out; /* Map the socket to an unused fd that can be returned to the user. */ - retval = sock_map_fd(newsock, 0); + retval = get_unused_fd(); if (retval < 0) { sock_release(newsock); goto out; } + newfile = sock_alloc_file(newsock, 0); + if (unlikely(IS_ERR(newfile))) { + put_unused_fd(retval); + sock_release(newsock); + return PTR_ERR(newfile); + } + SCTP_DEBUG_PRINTK("%s: sk: %p newsk: %p sd: %d\n", __func__, sk, newsock->sk, retval); /* Return the fd mapped to the new socket. */ + if (put_user(len, optlen)) { + fput(newfile); + put_unused_fd(retval); + return -EFAULT; + } peeloff.sd = retval; - if (put_user(len, optlen)) + if (copy_to_user(optval, &peeloff, len)) { + fput(newfile); + put_unused_fd(retval); return -EFAULT; - if (copy_to_user(optval, &peeloff, len)) - retval = -EFAULT; - + } + fd_install(retval, newfile); out: return retval; } diff --git a/net/socket.c b/net/socket.c index a14ec19164b6..38a14311f3a6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -346,7 +346,7 @@ static struct file_system_type sock_fs_type = { * but we take care of internal coherence yet. */ -static struct file *sock_alloc_file(struct socket *sock, int flags) +struct file *sock_alloc_file(struct socket *sock, int flags) { struct qstr name = { .name = "" }; struct path path; @@ -375,8 +375,9 @@ static struct file *sock_alloc_file(struct socket *sock, int flags) file->private_data = sock; return file; } +EXPORT_SYMBOL(sock_alloc_file); -int sock_map_fd(struct socket *sock, int flags) +static int sock_map_fd(struct socket *sock, int flags) { struct file *newfile; int fd = get_unused_fd_flags(flags); @@ -392,7 +393,6 @@ int sock_map_fd(struct socket *sock, int flags) put_unused_fd(fd); return PTR_ERR(newfile); } -EXPORT_SYMBOL(sock_map_fd); struct socket *sock_from_file(struct file *file, int *err) { -- cgit v1.2.3 From c3c073f808b22dfae15ef8412b6f7b998644139a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Aug 2012 22:32:06 -0400 Subject: new helper: iterate_fd() iterates through the opened files in given descriptor table, calling a supplied function; we stop once non-zero is returned. Callback gets struct file *, descriptor number and const void * argument passed to iterator. It is called with files->file_lock held, so it is not allowed to block. tty_io, netprio_cgroup and selinux flush_unauthorized_files() converted to its use. Signed-off-by: Al Viro --- net/core/netprio_cgroup.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index c75e3f9d060f..5ffd084c6a83 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -272,38 +272,24 @@ out_free_devname: return ret; } +static int update_netprio(const void *v, struct file *file, unsigned n) +{ + int err; + struct socket *sock = sock_from_file(file, &err); + if (sock) + sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v; + return 0; +} + void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { struct task_struct *p; + void *v; cgroup_taskset_for_each(p, cgrp, tset) { - unsigned int fd; - struct fdtable *fdt; - struct files_struct *files; - task_lock(p); - files = p->files; - if (!files) { - task_unlock(p); - continue; - } - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - for (fd = 0; fd < fdt->max_fds; fd++) { - struct file *file; - struct socket *sock; - int err; - - file = fcheck_files(files, fd); - if (!file) - continue; - - sock = sock_from_file(file, &err); - if (sock) - sock_update_netprioidx(sock->sk, p); - } - spin_unlock(&files->file_lock); + v = (void *)(unsigned long)task_netprioidx(p); + iterate_fd(p->files, 0, update_netprio, v); task_unlock(p); } } -- cgit v1.2.3 From cb0942b81249798e15c3f04eee2946ef543e8115 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Aug 2012 14:48:26 -0400 Subject: make get_file() return its argument simplifies a bunch of callers... Signed-off-by: Al Viro --- net/compat.c | 3 +-- net/core/scm.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/compat.c b/net/compat.c index 74ed1d7a84a2..79ae88485001 100644 --- a/net/compat.c +++ b/net/compat.c @@ -301,8 +301,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) break; } /* Bump the usage count and install the file. */ - get_file(fp[i]); - fd_install(new_fd, fp[i]); + fd_install(new_fd, get_file(fp[i])); } if (i > 0) { diff --git a/net/core/scm.c b/net/core/scm.c index 040cebeed45b..b0098d259233 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -281,11 +281,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) break; } /* Bump the usage count and install the file. */ - get_file(fp[i]); sock = sock_from_file(fp[i], &err); if (sock) sock_update_netprioidx(sock->sk, current); - fd_install(new_fd, fp[i]); + fd_install(new_fd, get_file(fp[i])); } if (i > 0) -- cgit v1.2.3 From a11a2bf4de5679fa0b63474c7d39bea2dac7d061 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 2 Aug 2012 13:21:43 -0400 Subject: SUNRPC: Optimise away unnecessary data moves in xdr_align_pages We only have to call xdr_shrink_pagelen() if the remaining RPC message does not fit in the page buffer length that we supplied to xdr_align_pages(). Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index fbbd1c475b43..08f50afd5f2a 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -730,21 +730,24 @@ static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len) if (xdr->nwords == 0) return 0; - if (nwords > xdr->nwords) { - nwords = xdr->nwords; - len = nwords << 2; - } /* Realign pages to current pointer position */ iov = buf->head; - if (iov->iov_len > cur) + if (iov->iov_len > cur) { xdr_shrink_bufhead(buf, iov->iov_len - cur); + xdr->nwords = XDR_QUADLEN(buf->len - cur); + } - /* Truncate page data and move it into the tail */ - if (buf->page_len > len) - xdr_shrink_pagelen(buf, buf->page_len - len); - else + if (nwords > xdr->nwords) { + nwords = xdr->nwords; + len = nwords << 2; + } + if (buf->page_len <= len) len = buf->page_len; - xdr->nwords = XDR_QUADLEN(buf->len - cur); + else if (nwords < xdr->nwords) { + /* Truncate page data and move it into the tail */ + xdr_shrink_pagelen(buf, buf->page_len - len); + xdr->nwords = XDR_QUADLEN(buf->len - cur); + } return len; } -- cgit v1.2.3 From d19751e7b9bd8a01d00372325439589886674f79 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Sep 2012 17:21:25 -0400 Subject: SUNRPC: Get rid of the redundant xprt->shutdown bit field It is only set after everyone has dereferenced the transport, and serves no useful purpose: setting it is racy, so all the socket code, etc still needs to be able to cope with the cases where they miss reading it. Signed-off-by: Trond Myklebust --- net/sunrpc/xprt.c | 8 ++------ net/sunrpc/xprtrdma/transport.c | 22 ++++++++-------------- net/sunrpc/xprtsock.c | 18 ------------------ 3 files changed, 10 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 5d7f61d7559c..bd462a532acf 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -231,7 +231,7 @@ EXPORT_SYMBOL_GPL(xprt_reserve_xprt); static void xprt_clear_locked(struct rpc_xprt *xprt) { xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state) || xprt->shutdown) { + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { smp_mb__before_clear_bit(); clear_bit(XPRT_LOCKED, &xprt->state); smp_mb__after_clear_bit(); @@ -504,9 +504,6 @@ EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); */ void xprt_write_space(struct rpc_xprt *xprt) { - if (unlikely(xprt->shutdown)) - return; - spin_lock_bh(&xprt->transport_lock); if (xprt->snd_task) { dprintk("RPC: write space: waking waiting task on " @@ -679,7 +676,7 @@ xprt_init_autodisconnect(unsigned long data) struct rpc_xprt *xprt = (struct rpc_xprt *)data; spin_lock(&xprt->transport_lock); - if (!list_empty(&xprt->recv) || xprt->shutdown) + if (!list_empty(&xprt->recv)) goto out_abort; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) goto out_abort; @@ -1262,7 +1259,6 @@ out: static void xprt_destroy(struct rpc_xprt *xprt) { dprintk("RPC: destroying transport %p\n", xprt); - xprt->shutdown = 1; del_timer_sync(&xprt->timer); rpc_destroy_wait_queue(&xprt->binding); diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 5d9202dc7cb1..c9aa7a35f3bf 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -199,21 +199,15 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->xprt; int rc = 0; - if (!xprt->shutdown) { - current->flags |= PF_FSTRANS; - xprt_clear_connected(xprt); - - dprintk("RPC: %s: %sconnect\n", __func__, - r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); - rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - goto out; - } - goto out_clear; + current->flags |= PF_FSTRANS; + xprt_clear_connected(xprt); + + dprintk("RPC: %s: %sconnect\n", __func__, + r_xprt->rx_ep.rep_connected != 0 ? "re" : ""); + rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); + if (rc) + xprt_wake_pending_tasks(xprt, rc); -out: - xprt_wake_pending_tasks(xprt, rc); -out_clear: dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); current->flags &= ~PF_FSTRANS; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 97f8918169ed..aaaadfbe36e9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -917,9 +917,6 @@ static void xs_local_data_ready(struct sock *sk, int len) if (skb == NULL) goto out; - if (xprt->shutdown) - goto dropit; - repsize = skb->len - sizeof(rpc_fraghdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d\n", repsize); @@ -981,9 +978,6 @@ static void xs_udp_data_ready(struct sock *sk, int len) if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) goto out; - if (xprt->shutdown) - goto dropit; - repsize = skb->len - sizeof(struct udphdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d!\n", repsize); @@ -1412,9 +1406,6 @@ static void xs_tcp_data_ready(struct sock *sk, int bytes) read_lock_bh(&sk->sk_callback_lock); if (!(xprt = xprt_from_sock(sk))) goto out; - if (xprt->shutdown) - goto out; - /* Any data means we had a useful conversation, so * the we don't need to delay the next reconnect */ @@ -1901,9 +1892,6 @@ static void xs_local_setup_socket(struct work_struct *work) struct socket *sock; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); @@ -2020,9 +2008,6 @@ static void xs_udp_setup_socket(struct work_struct *work) struct socket *sock = transport->sock; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; /* Start by resetting any existing state */ @@ -2168,9 +2153,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; int status = -EIO; - if (xprt->shutdown) - goto out; - current->flags |= PF_FSTRANS; if (!sock) { -- cgit v1.2.3 From 9b96ce71974127af0304514d310abe596426c112 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Sep 2012 20:24:16 -0400 Subject: SUNRPC: Limit the rpciod workqueue concurrency We shouldn't need more than 1 worker thread per cpu, since rpciod is designed to run without sleeping in most cases. Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 128494ec9a64..6357fcb00c7e 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -1022,7 +1022,7 @@ static int rpciod_start(void) * Create the rpciod thread and wait for it to start. */ dprintk("RPC: creating workqueue rpciod\n"); - wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 0); + wq = alloc_workqueue("rpciod", WQ_MEM_RECLAIM, 1); rpciod_workqueue = wq; return rpciod_workqueue != NULL; } -- cgit v1.2.3 From 290e33593d76d1cebf873da50e036559c4820af9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 17 Aug 2012 09:47:49 -0700 Subject: libceph: remove unused monc->have_fsid This is unused; use monc->client->have_fsid. Signed-off-by: Sage Weil --- net/ceph/mon_client.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 900ea0f043fc..e98f6070b5ae 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -769,7 +769,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc) monc->monmap->mon_inst[i].name.num = cpu_to_le64(i); } monc->monmap->num_mon = num_mon; - monc->have_fsid = false; return 0; } -- cgit v1.2.3 From 7698f2f5e0d7b7a062213fa970b7c4e121adf38e Mon Sep 17 00:00:00 2001 From: Iulius Curt Date: Thu, 23 Aug 2012 15:14:29 +0300 Subject: libceph: Fix sparse warning Make ceph_monc_do_poolop() static to remove the following sparse warning: * net/ceph/mon_client.c:616:5: warning: symbol 'ceph_monc_do_poolop' was not declared. Should it be static? Also drops the 'ceph_monc_' prefix, now being a private function. Signed-off-by: Iulius Curt Signed-off-by: Sage Weil --- net/ceph/mon_client.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index e98f6070b5ae..812eb3b46c1f 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -637,7 +637,7 @@ bad: /* * Do a synchronous pool op. */ -int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op, +static int do_poolop(struct ceph_mon_client *monc, u32 op, u32 pool, u64 snapid, char *buf, int len) { @@ -687,7 +687,7 @@ out: int ceph_monc_create_snapid(struct ceph_mon_client *monc, u32 pool, u64 *snapid) { - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, pool, 0, (char *)snapid, sizeof(*snapid)); } @@ -696,7 +696,7 @@ EXPORT_SYMBOL(ceph_monc_create_snapid); int ceph_monc_delete_snapid(struct ceph_mon_client *monc, u32 pool, u64 snapid) { - return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, + return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, pool, snapid, 0, 0); } -- cgit v1.2.3 From cc4829e5967de577794b25dfcd1a65e509d171ed Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 5 Sep 2012 14:34:32 +0800 Subject: ceph: use list_move_tail instead of list_del/list_add_tail Using list_move_tail() instead of list_del() + list_add_tail(). Signed-off-by: Wei Yongjun Signed-off-by: Sage Weil --- net/ceph/pagelist.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 665cd23020ff..92866bebb65f 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -1,4 +1,3 @@ - #include #include #include @@ -134,8 +133,8 @@ int ceph_pagelist_truncate(struct ceph_pagelist *pl, ceph_pagelist_unmap_tail(pl); while (pl->head.prev != c->page_lru) { page = list_entry(pl->head.prev, struct page, lru); - list_del(&page->lru); /* remove from pagelist */ - list_add_tail(&page->lru, &pl->free_list); /* add to reserve */ + /* move from pagelist to reserve */ + list_move_tail(&page->lru, &pl->free_list); ++pl->num_pages_free; } pl->room = c->room; -- cgit v1.2.3 From d63b77f4c552cc3a20506871046ab0fcbc332609 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Sep 2012 20:59:48 -0700 Subject: libceph: check for invalid mapping If we encounter an invalid (e.g., zeroed) mapping, return an error and avoid a divide by zero. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 32 ++++++++++++++++++++------------ net/ceph/osdmap.c | 18 ++++++++++++++++-- 2 files changed, 36 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 42119c05e82c..f7b56e23988c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -52,7 +52,7 @@ static int op_has_extent(int op) op == CEPH_OSD_OP_WRITE); } -void ceph_calc_raw_layout(struct ceph_osd_client *osdc, +int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, u64 snapid, u64 off, u64 *plen, u64 *bno, @@ -62,12 +62,15 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; u64 orig_len = *plen; u64 objoff, objlen; /* extent in object */ + int r; reqhead->snapid = cpu_to_le64(snapid); /* object extent? */ - ceph_calc_file_object_mapping(layout, off, plen, bno, - &objoff, &objlen); + r = ceph_calc_file_object_mapping(layout, off, plen, bno, + &objoff, &objlen); + if (r < 0) + return r; if (*plen < orig_len) dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); @@ -83,7 +86,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc, dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", *bno, objoff, objlen, req->r_num_pages); - + return 0; } EXPORT_SYMBOL(ceph_calc_raw_layout); @@ -112,20 +115,25 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); * * fill osd op in request message. */ -static void calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) +static int calc_layout(struct ceph_osd_client *osdc, + struct ceph_vino vino, + struct ceph_file_layout *layout, + u64 off, u64 *plen, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) { u64 bno; + int r; - ceph_calc_raw_layout(osdc, layout, vino.snap, off, - plen, &bno, req, op); + r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, + plen, &bno, req, op); + if (r < 0) + return r; snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); req->r_oid_len = strlen(req->r_oid); + + return r; } /* diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 3124b71a8883..5433fb0eb3c6 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -984,7 +984,7 @@ bad: * for now, we write only a single su, until we can * pass a stride back to the caller. */ -void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, +int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *ono, u64 *oxoff, u64 *oxlen) @@ -998,11 +998,17 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, osize, su); + if (su == 0 || sc == 0) + goto invalid; su_per_object = osize / su; + if (su_per_object == 0) + goto invalid; dout("osize %u / su %u = su_per_object %u\n", osize, su, su_per_object); - BUG_ON((su & ~PAGE_MASK) != 0); + if ((su & ~PAGE_MASK) != 0) + goto invalid; + /* bl = *off / su; */ t = off; do_div(t, su); @@ -1030,6 +1036,14 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout, *plen = *oxlen; dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); + return 0; + +invalid: + dout(" invalid layout\n"); + *ono = 0; + *oxoff = 0; + *oxlen = 0; + return -EINVAL; } EXPORT_SYMBOL(ceph_calc_file_object_mapping); -- cgit v1.2.3 From 6816282dab3a72efe8c0d182c1bc2960d87f4322 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 24 Sep 2012 21:01:02 -0700 Subject: ceph: propagate layout error on osd request creation If we are creating an osd request and get an invalid layout, return an EINVAL to the caller. We switch up the return to have an error code instead of NULL implying -ENOMEM. Signed-off-by: Sage Weil Reviewed-by: Alex Elder --- net/ceph/osd_client.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index f7b56e23988c..ccbdfbba9e53 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -464,6 +464,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, { struct ceph_osd_req_op ops[3]; struct ceph_osd_request *req; + int r; ops[0].op = opcode; ops[0].extent.truncate_seq = truncate_seq; @@ -482,10 +483,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, use_mempool, GFP_NOFS, NULL, NULL); if (!req) - return NULL; + return ERR_PTR(-ENOMEM); /* calculate max write size */ - calc_layout(osdc, vino, layout, off, plen, req, ops); + r = calc_layout(osdc, vino, layout, off, plen, req, ops); + if (r < 0) + return ERR_PTR(r); req->r_file_layout = *layout; /* keep a copy */ /* in case it differs from natural (file) alignment that @@ -1928,8 +1931,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, 0, truncate_seq, truncate_size, NULL, false, 1, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* it may be a short read due to an object boundary */ req->r_pages = pages; @@ -1971,8 +1974,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, snapc, do_sync, truncate_seq, truncate_size, mtime, nofail, 1, page_align); - if (!req) - return -ENOMEM; + if (IS_ERR(req)) + return PTR_ERR(req); /* it may be a short write due to an object boundary */ req->r_pages = pages; -- cgit v1.2.3 From d8af9bc16c9e9506c10581111f897be04486218f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:34 -0400 Subject: SUNRPC: Clean up dprintk messages in rpc_pipe.c Clean up: The blank space in front of the message must be spaces. Tabs show up on the console as a graphical character. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpc_pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 21fde99e5c56..80f5dd23417d 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -1119,8 +1119,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; if (rpc_populate(root, files, RPCAUTH_lockd, RPCAUTH_RootEOF, NULL)) return -ENOMEM; - dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", net, - NET_NAME(net)); + dprintk("RPC: sending pipefs MOUNT notification for net %p%s\n", + net, NET_NAME(net)); sn->pipefs_sb = sb; err = blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_MOUNT, @@ -1155,8 +1155,8 @@ static void rpc_kill_sb(struct super_block *sb) sn->pipefs_sb = NULL; mutex_unlock(&sn->pipefs_sb_lock); put_net(net); - dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", net, - NET_NAME(net)); + dprintk("RPC: sending pipefs UMOUNT notification for net %p%s\n", + net, NET_NAME(net)); blocking_notifier_call_chain(&rpc_pipefs_notifier_list, RPC_PIPEFS_UMOUNT, sb); -- cgit v1.2.3 From 632f0d0503accb8ab749a1165af99d344579c37b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:43 -0400 Subject: SUNRPC: Use __func__ in dprintk() in auth_gss.c Clean up: Some function names have changed, but debugging messages were never updated. Automate the construction of the function name in debugging messages. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/auth_gss.c | 58 +++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 34c522021004..909dc0c31aab 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -239,7 +239,7 @@ gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct } return q; err: - dprintk("RPC: gss_fill_context returning %ld\n", -PTR_ERR(p)); + dprintk("RPC: %s returning %ld\n", __func__, -PTR_ERR(p)); return p; } @@ -301,10 +301,10 @@ __gss_find_upcall(struct rpc_pipe *pipe, uid_t uid) if (pos->uid != uid) continue; atomic_inc(&pos->count); - dprintk("RPC: gss_find_upcall found msg %p\n", pos); + dprintk("RPC: %s found msg %p\n", __func__, pos); return pos; } - dprintk("RPC: gss_find_upcall found nothing\n"); + dprintk("RPC: %s found nothing\n", __func__); return NULL; } @@ -507,8 +507,8 @@ gss_refresh_upcall(struct rpc_task *task) struct rpc_pipe *pipe; int err = 0; - dprintk("RPC: %5u gss_refresh_upcall for uid %u\n", task->tk_pid, - cred->cr_uid); + dprintk("RPC: %5u %s for uid %u\n", + task->tk_pid, __func__, cred->cr_uid); gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { /* XXX: warning on the first, under the assumption we @@ -539,8 +539,8 @@ gss_refresh_upcall(struct rpc_task *task) spin_unlock(&pipe->lock); gss_release_msg(gss_msg); out: - dprintk("RPC: %5u gss_refresh_upcall for uid %u result %d\n", - task->tk_pid, cred->cr_uid, err); + dprintk("RPC: %5u %s for uid %u result %d\n", + task->tk_pid, __func__, cred->cr_uid, err); return err; } @@ -553,7 +553,7 @@ gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) DEFINE_WAIT(wait); int err = 0; - dprintk("RPC: gss_upcall for uid %u\n", cred->cr_uid); + dprintk("RPC: %s for uid %u\n", __func__, cred->cr_uid); retry: gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); if (PTR_ERR(gss_msg) == -EAGAIN) { @@ -594,8 +594,8 @@ out_intr: finish_wait(&gss_msg->waitqueue, &wait); gss_release_msg(gss_msg); out: - dprintk("RPC: gss_create_upcall for uid %u result %d\n", - cred->cr_uid, err); + dprintk("RPC: %s for uid %u result %d\n", + __func__, cred->cr_uid, err); return err; } @@ -681,7 +681,7 @@ err_put_ctx: err: kfree(buf); out: - dprintk("RPC: gss_pipe_downcall returning %Zd\n", err); + dprintk("RPC: %s returning %Zd\n", __func__, err); return err; } @@ -747,8 +747,8 @@ gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); if (msg->errno < 0) { - dprintk("RPC: gss_pipe_destroy_msg releasing msg %p\n", - gss_msg); + dprintk("RPC: %s releasing msg %p\n", + __func__, gss_msg); atomic_inc(&gss_msg->count); gss_unhash_msg(gss_msg); if (msg->errno == -ETIMEDOUT) @@ -976,7 +976,7 @@ gss_destroying_context(struct rpc_cred *cred) static void gss_do_free_ctx(struct gss_cl_ctx *ctx) { - dprintk("RPC: gss_free_ctx\n"); + dprintk("RPC: %s\n", __func__); gss_delete_sec_context(&ctx->gc_gss_ctx); kfree(ctx->gc_wire_ctx.data); @@ -999,7 +999,7 @@ gss_free_ctx(struct gss_cl_ctx *ctx) static void gss_free_cred(struct gss_cred *gss_cred) { - dprintk("RPC: gss_free_cred %p\n", gss_cred); + dprintk("RPC: %s cred=%p\n", __func__, gss_cred); kfree(gss_cred); } @@ -1049,8 +1049,8 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) struct gss_cred *cred = NULL; int err = -ENOMEM; - dprintk("RPC: gss_create_cred for uid %d, flavor %d\n", - acred->uid, auth->au_flavor); + dprintk("RPC: %s for uid %d, flavor %d\n", + __func__, acred->uid, auth->au_flavor); if (!(cred = kzalloc(sizeof(*cred), GFP_NOFS))) goto out_err; @@ -1069,7 +1069,7 @@ gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) return &cred->gc_base; out_err: - dprintk("RPC: gss_create_cred failed with error %d\n", err); + dprintk("RPC: %s failed with error %d\n", __func__, err); return ERR_PTR(err); } @@ -1127,7 +1127,7 @@ gss_marshal(struct rpc_task *task, __be32 *p) struct kvec iov; struct xdr_buf verf_buf; - dprintk("RPC: %5u gss_marshal\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); *p++ = htonl(RPC_AUTH_GSS); cred_len = p++; @@ -1253,7 +1253,7 @@ gss_validate(struct rpc_task *task, __be32 *p) u32 flav,len; u32 maj_stat; - dprintk("RPC: %5u gss_validate\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); flav = ntohl(*p++); if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE) @@ -1271,20 +1271,20 @@ gss_validate(struct rpc_task *task, __be32 *p) if (maj_stat == GSS_S_CONTEXT_EXPIRED) clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags); if (maj_stat) { - dprintk("RPC: %5u gss_validate: gss_verify_mic returned " - "error 0x%08x\n", task->tk_pid, maj_stat); + dprintk("RPC: %5u %s: gss_verify_mic returned error 0x%08x\n", + task->tk_pid, __func__, maj_stat); goto out_bad; } /* We leave it to unwrap to calculate au_rslack. For now we just * calculate the length of the verifier: */ cred->cr_auth->au_verfsize = XDR_QUADLEN(len) + 2; gss_put_ctx(ctx); - dprintk("RPC: %5u gss_validate: gss_verify_mic succeeded.\n", - task->tk_pid); + dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n", + task->tk_pid, __func__); return p + XDR_QUADLEN(len); out_bad: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_validate failed.\n", task->tk_pid); + dprintk("RPC: %5u %s failed.\n", task->tk_pid, __func__); return NULL; } @@ -1466,7 +1466,7 @@ gss_wrap_req(struct rpc_task *task, struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); int status = -EIO; - dprintk("RPC: %5u gss_wrap_req\n", task->tk_pid); + dprintk("RPC: %5u %s\n", task->tk_pid, __func__); if (ctx->gc_proc != RPC_GSS_PROC_DATA) { /* The spec seems a little ambiguous here, but I think that not * wrapping context destruction requests makes the most sense. @@ -1489,7 +1489,7 @@ gss_wrap_req(struct rpc_task *task, } out: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_wrap_req returning %d\n", task->tk_pid, status); + dprintk("RPC: %5u %s returning %d\n", task->tk_pid, __func__, status); return status; } @@ -1604,8 +1604,8 @@ out_decode: status = gss_unwrap_req_decode(decode, rqstp, p, obj); out: gss_put_ctx(ctx); - dprintk("RPC: %5u gss_unwrap_resp returning %d\n", task->tk_pid, - status); + dprintk("RPC: %5u %s returning %d\n", + task->tk_pid, __func__, status); return status; } -- cgit v1.2.3 From 1b63a75180c6c65c71655c250a4e6b578ba7d1c0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:23:52 -0400 Subject: SUNRPC: Refactor rpc_clone_client() rpc_clone_client() does most of the same tasks as rpc_new_client(), so there is an opportunity for code re-use. Create a generic helper that makes it easy to clone an RPC client while replacing any of the clnt's parameters. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 83 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 43 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index fa48c60aef23..afbeefab6600 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -490,59 +490,62 @@ EXPORT_SYMBOL_GPL(rpc_create); * same transport while varying parameters such as the authentication * flavour. */ -struct rpc_clnt * -rpc_clone_client(struct rpc_clnt *clnt) +static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args, + struct rpc_clnt *clnt) { - struct rpc_clnt *new; struct rpc_xprt *xprt; - int err = -ENOMEM; + struct rpc_clnt *new; + int err; - new = kmemdup(clnt, sizeof(*new), GFP_KERNEL); - if (!new) - goto out_no_clnt; - new->cl_parent = clnt; - /* Turn off autobind on clones */ - new->cl_autobind = 0; - INIT_LIST_HEAD(&new->cl_tasks); - spin_lock_init(&new->cl_lock); - rpc_init_rtt(&new->cl_rtt_default, clnt->cl_timeout->to_initval); - new->cl_metrics = rpc_alloc_iostats(clnt); - if (new->cl_metrics == NULL) - goto out_no_stats; - if (clnt->cl_principal) { - new->cl_principal = kstrdup(clnt->cl_principal, GFP_KERNEL); - if (new->cl_principal == NULL) - goto out_no_principal; - } + err = -ENOMEM; rcu_read_lock(); xprt = xprt_get(rcu_dereference(clnt->cl_xprt)); rcu_read_unlock(); if (xprt == NULL) - goto out_no_transport; - rcu_assign_pointer(new->cl_xprt, xprt); - atomic_set(&new->cl_count, 1); - err = rpc_setup_pipedir(new, clnt->cl_program->pipe_dir_name); - if (err != 0) - goto out_no_path; - rpc_clnt_set_nodename(new, utsname()->nodename); - if (new->cl_auth) - atomic_inc(&new->cl_auth->au_count); + goto out_err; + args->servername = xprt->servername; + + new = rpc_new_client(args, xprt); + if (IS_ERR(new)) { + err = PTR_ERR(new); + goto out_put; + } + atomic_inc(&clnt->cl_count); - rpc_register_client(new); - rpciod_up(); + new->cl_parent = clnt; + + /* Turn off autobind on clones */ + new->cl_autobind = 0; + new->cl_softrtry = clnt->cl_softrtry; + new->cl_discrtry = clnt->cl_discrtry; + new->cl_chatty = clnt->cl_chatty; return new; -out_no_path: + +out_put: xprt_put(xprt); -out_no_transport: - kfree(new->cl_principal); -out_no_principal: - rpc_free_iostats(new->cl_metrics); -out_no_stats: - kfree(new); -out_no_clnt: +out_err: dprintk("RPC: %s: returned error %d\n", __func__, err); return ERR_PTR(err); } + +/** + * rpc_clone_client - Clone an RPC client structure + * + * @clnt: RPC client whose parameters are copied + * + * Returns a fresh RPC client or an ERR_PTR. + */ +struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) +{ + struct rpc_create_args args = { + .program = clnt->cl_program, + .prognumber = clnt->cl_prog, + .version = clnt->cl_vers, + .authflavor = clnt->cl_auth->au_flavor, + .client_name = clnt->cl_principal, + }; + return __rpc_clone_client(&args, clnt); +} EXPORT_SYMBOL_GPL(rpc_clone_client); /* -- cgit v1.2.3 From ba9b584c1dc37851d9c6ca6d0d2ccba55d9aad04 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 14 Sep 2012 17:24:02 -0400 Subject: SUNRPC: Introduce rpc_clone_client_set_auth() An ULP is supposed to be able to replace a GSS rpc_auth object with another GSS rpc_auth object using rpcauth_create(). However, rpcauth_create() in 3.5 reliably fails with -EEXIST in this case. This is because when gss_create() attempts to create the upcall pipes, sometimes they are already there. For example if a pipe FS mount event occurs, or a previous GSS flavor was in use for this rpc_clnt. It turns out that's not the only problem here. While working on a fix for the above problem, we noticed that replacing an rpc_clnt's rpc_auth is not safe, since dereferencing the cl_auth field is not protected in any way. So we're deprecating the ability of rpcauth_create() to switch an rpc_clnt's security flavor during normal operation. Instead, let's add a fresh API that clones an rpc_clnt and gives the clone a new flavor before it's used. This makes immediate use of the new __rpc_clone_client() helper. This can be used in a similar fashion to rpcauth_create() when a client is hunting for the correct security flavor. Instead of replacing an rpc_clnt's security flavor in a loop, the ULP replaces the whole rpc_clnt. To fix the -EEXIST problem, any ULP logic that relies on replacing an rpc_clnt's rpc_auth with rpcauth_create() must be changed to use this API instead. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'net') diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index afbeefab6600..cdc7564b4512 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -548,6 +548,28 @@ struct rpc_clnt *rpc_clone_client(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_clone_client); +/** + * rpc_clone_client_set_auth - Clone an RPC client structure and set its auth + * + * @clnt: RPC client whose parameters are copied + * @auth: security flavor for new client + * + * Returns a fresh RPC client or an ERR_PTR. + */ +struct rpc_clnt * +rpc_clone_client_set_auth(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + struct rpc_create_args args = { + .program = clnt->cl_program, + .prognumber = clnt->cl_prog, + .version = clnt->cl_vers, + .authflavor = flavor, + .client_name = clnt->cl_principal, + }; + return __rpc_clone_client(&args, clnt); +} +EXPORT_SYMBOL_GPL(rpc_clone_client_set_auth); + /* * Kill all tasks for the given client. * XXX: kill their descendants as well? -- cgit v1.2.3 From 6299b669b1340b9f7de2bc2bd565921a1494e7f7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 4 Oct 2012 17:12:08 -0700 Subject: sections: fix section conflicts in net/can Signed-off-by: Andi Kleen Cc: Oliver Hartkopp Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/can/af_can.c | 2 +- net/can/bcm.c | 2 +- net/can/gw.c | 2 +- net/can/raw.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/can/af_can.c b/net/can/af_can.c index 821022a7214f..ddac1ee2ed20 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -63,7 +63,7 @@ #include "af_can.h" -static __initdata const char banner[] = KERN_INFO +static __initconst const char banner[] = KERN_INFO "can: controller area network core (" CAN_VERSION_STRING ")\n"; MODULE_DESCRIPTION("Controller Area Network PF_CAN core"); diff --git a/net/can/bcm.c b/net/can/bcm.c index 151b7730c12c..6f747582718e 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -77,7 +77,7 @@ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) #define CAN_BCM_VERSION CAN_VERSION -static __initdata const char banner[] = KERN_INFO +static __initconst const char banner[] = KERN_INFO "can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n"; MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); diff --git a/net/can/gw.c b/net/can/gw.c index 127879c55fb6..1f5c9785a262 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -58,7 +58,7 @@ #include #define CAN_GW_VERSION "20101209" -static __initdata const char banner[] = +static __initconst const char banner[] = KERN_INFO "can: netlink gateway (rev " CAN_GW_VERSION ")\n"; MODULE_DESCRIPTION("PF_CAN netlink gateway"); diff --git a/net/can/raw.c b/net/can/raw.c index 3e9c89356a93..5b0e3e330d97 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -55,7 +55,7 @@ #include #define CAN_RAW_VERSION CAN_VERSION -static __initdata const char banner[] = +static __initconst const char banner[] = KERN_INFO "can: raw protocol (rev " CAN_RAW_VERSION ")\n"; MODULE_DESCRIPTION("PF_CAN raw protocol"); -- cgit v1.2.3 From 04a6f82cf01aeef9fb058b2fca0ef1fe0a09c2fa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 4 Oct 2012 17:12:11 -0700 Subject: sections: fix section conflicts in net Signed-off-by: Andi Kleen Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/decnet/dn_rules.c | 2 +- net/ipv4/fib_rules.c | 2 +- net/ipv4/ipmr.c | 2 +- net/ipv6/addrlabel.c | 2 +- net/ipv6/fib6_rules.c | 2 +- net/ipv6/ip6mr.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c index e65f2c856e06..faf7cc3483fe 100644 --- a/net/decnet/dn_rules.c +++ b/net/decnet/dn_rules.c @@ -220,7 +220,7 @@ static void dn_fib_rule_flush_cache(struct fib_rules_ops *ops) dn_rt_cache_flush(-1); } -static const struct fib_rules_ops __net_initdata dn_fib_rules_ops_template = { +static const struct fib_rules_ops __net_initconst dn_fib_rules_ops_template = { .family = AF_DECnet, .rule_size = sizeof(struct dn_fib_rule), .addr_size = sizeof(u16), diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 274309d3aded..26aa65d1fce4 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -262,7 +262,7 @@ static void fib4_rule_flush_cache(struct fib_rules_ops *ops) rt_cache_flush(ops->fro_net); } -static const struct fib_rules_ops __net_initdata fib4_rules_ops_template = { +static const struct fib_rules_ops __net_initconst fib4_rules_ops_template = { .family = AF_INET, .rule_size = sizeof(struct fib4_rule), .addr_size = sizeof(u32), diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1daa95c2a0ba..6168c4dc58b1 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -221,7 +221,7 @@ static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, return 0; } -static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = { +static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = { .family = RTNL_FAMILY_IPMR, .rule_size = sizeof(struct ipmr_rule), .addr_size = sizeof(u32), diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 4be23da32b89..ff76eecfd622 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -79,7 +79,7 @@ struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) #define IPV6_ADDR_LABEL_DEFAULT 0xffffffffUL -static const __net_initdata struct ip6addrlbl_init_table +static const __net_initconst struct ip6addrlbl_init_table { const struct in6_addr *prefix; int prefixlen; diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 0ff1cfd55bc4..d9fb9110f607 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -238,7 +238,7 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule) + nla_total_size(16); /* src */ } -static const struct fib_rules_ops __net_initdata fib6_rules_ops_template = { +static const struct fib_rules_ops __net_initconst fib6_rules_ops_template = { .family = AF_INET6, .rule_size = sizeof(struct fib6_rule), .addr_size = sizeof(struct in6_addr), diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 08ea3f0b6e55..f7c7c6319720 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -205,7 +205,7 @@ static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, return 0; } -static const struct fib_rules_ops __net_initdata ip6mr_rules_ops_template = { +static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), -- cgit v1.2.3 From 6dc878a8ca39e93f70c42f3dd7260bde10c1e0f1 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 4 Oct 2012 20:15:48 +0000 Subject: netlink: add reference of module in netlink_dump_start I get a panic when I use ss -a and rmmod inet_diag at the same time. It's because netlink_dump uses inet_diag_dump which belongs to module inet_diag. I search the codes and find many modules have the same problem. We need to add a reference to the module which the cb->dump belongs to. Thanks for all help from Stephen,Jan,Eric,Steffen and Pablo. Change From v3: change netlink_dump_start to inline,suggestion from Pablo and Eric. Change From v2: delete netlink_dump_done,and call module_put in netlink_dump and netlink_sock_destruct. Signed-off-by: Gao feng Signed-off-by: David S. Miller --- net/netlink/af_netlink.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0f2e3ad69c47..01e944a017a4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -169,6 +169,8 @@ static void netlink_sock_destruct(struct sock *sk) if (nlk->cb) { if (nlk->cb->done) nlk->cb->done(nlk->cb); + + module_put(nlk->cb->module); netlink_destroy_callback(nlk->cb); } @@ -1758,6 +1760,7 @@ static int netlink_dump(struct sock *sk) nlk->cb = NULL; mutex_unlock(nlk->cb_mutex); + module_put(cb->module); netlink_consume_callback(cb); return 0; @@ -1767,9 +1770,9 @@ errout_skb: return err; } -int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, - const struct nlmsghdr *nlh, - struct netlink_dump_control *control) +int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + const struct nlmsghdr *nlh, + struct netlink_dump_control *control) { struct netlink_callback *cb; struct sock *sk; @@ -1784,6 +1787,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->done = control->done; cb->nlh = nlh; cb->data = control->data; + cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; atomic_inc(&skb->users); cb->skb = skb; @@ -1794,19 +1798,28 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, return -ECONNREFUSED; } nlk = nlk_sk(sk); - /* A dump is in progress... */ + mutex_lock(nlk->cb_mutex); + /* A dump is in progress... */ if (nlk->cb) { mutex_unlock(nlk->cb_mutex); netlink_destroy_callback(cb); - sock_put(sk); - return -EBUSY; + ret = -EBUSY; + goto out; } + /* add reference of module which cb->dump belongs to */ + if (!try_module_get(cb->module)) { + mutex_unlock(nlk->cb_mutex); + netlink_destroy_callback(cb); + ret = -EPROTONOSUPPORT; + goto out; + } + nlk->cb = cb; mutex_unlock(nlk->cb_mutex); ret = netlink_dump(sk); - +out: sock_put(sk); if (ret) @@ -1817,7 +1830,7 @@ int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, */ return -EINTR; } -EXPORT_SYMBOL(netlink_dump_start); +EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) { -- cgit v1.2.3 From acb600def2110b1310466c0e485c0d26299898ae Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Oct 2012 06:23:55 +0000 Subject: net: remove skb recycling Over time, skb recycling infrastructure got litle interest and many bugs. Generic rx path skb allocation is now using page fragments for efficient GRO / TCP coalescing, and recyling a tx skb for rx path is not worth the pain. Last identified bug is that fat skbs can be recycled and it can endup using high order pages after few iterations. With help from Maxime Bizon, who pointed out that commit 87151b8689d (net: allow pskb_expand_head() to get maximum tailroom) introduced this regression for recycled skbs. Instead of fixing this bug, lets remove skb recycling. Drivers wanting really hot skbs should use build_skb() anyway, to allocate/populate sk_buff right before netif_receive_skb() Signed-off-by: Eric Dumazet Cc: Maxime Bizon Signed-off-by: David S. Miller --- net/core/skbuff.c | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index cdc28598f4ef..6e04b1fa11f2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -655,53 +655,6 @@ void consume_skb(struct sk_buff *skb) } EXPORT_SYMBOL(consume_skb); -/** - * skb_recycle - clean up an skb for reuse - * @skb: buffer - * - * Recycles the skb to be reused as a receive buffer. This - * function does any necessary reference count dropping, and - * cleans up the skbuff as if it just came from __alloc_skb(). - */ -void skb_recycle(struct sk_buff *skb) -{ - struct skb_shared_info *shinfo; - - skb_release_head_state(skb); - - shinfo = skb_shinfo(skb); - memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); - atomic_set(&shinfo->dataref, 1); - - memset(skb, 0, offsetof(struct sk_buff, tail)); - skb->data = skb->head + NET_SKB_PAD; - skb_reset_tail_pointer(skb); -} -EXPORT_SYMBOL(skb_recycle); - -/** - * skb_recycle_check - check if skb can be reused for receive - * @skb: buffer - * @skb_size: minimum receive buffer size - * - * Checks that the skb passed in is not shared or cloned, and - * that it is linear and its head portion at least as large as - * skb_size so that it can be recycled as a receive buffer. - * If these conditions are met, this function does any necessary - * reference count dropping and cleans up the skbuff as if it - * just came from __alloc_skb(). - */ -bool skb_recycle_check(struct sk_buff *skb, int skb_size) -{ - if (!skb_is_recycleable(skb, skb_size)) - return false; - - skb_recycle(skb); - - return true; -} -EXPORT_SYMBOL(skb_recycle_check); - static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) { new->tstamp = old->tstamp; -- cgit v1.2.3 From e1f165032c8bade3a6bdf546f8faf61fda4dd01c Mon Sep 17 00:00:00 2001 From: "ramesh.nagappa@gmail.com" Date: Fri, 5 Oct 2012 19:10:15 +0000 Subject: net: Fix skb_under_panic oops in neigh_resolve_output The retry loop in neigh_resolve_output() and neigh_connected_output() call dev_hard_header() with out reseting the skb to network_header. This causes the retry to fail with skb_under_panic. The fix is to reset the network_header within the retry loop. Signed-off-by: Ramesh Nagappa Reviewed-by: Shawn Lu Reviewed-by: Robert Coulson Reviewed-by: Billie Alsup Signed-off-by: David S. Miller --- net/core/neighbour.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index baca771caae2..22571488730a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1301,8 +1301,6 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) if (!dst) goto discard; - __skb_pull(skb, skb_network_offset(skb)); - if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; @@ -1312,6 +1310,7 @@ int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) neigh_hh_init(neigh, dst); do { + __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); @@ -1342,9 +1341,8 @@ int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb) unsigned int seq; int err; - __skb_pull(skb, skb_network_offset(skb)); - do { + __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); -- cgit v1.2.3 From 51ec04038c113a811b177baa85d293feff9ce995 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 5 Oct 2012 20:43:30 +0000 Subject: ipv6: GRO should be ECN friendly IPv4 side of the problem was addressed in commit a9e050f4e7f9d (net: tcp: GRO should be ECN friendly) This patch does the same, but for IPv6 : A Traffic Class mismatch doesnt mean flows are different, but instead should force a flush of previous packets. This patch removes artificial packet reordering problem. Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index e22e6d88bac6..f757e3b7cfbf 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -880,22 +880,25 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, nlen = skb_network_header_len(skb); for (p = *head; p; p = p->next) { - struct ipv6hdr *iph2; + const struct ipv6hdr *iph2; + __be32 first_word; /* */ if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = ipv6_hdr(p); + first_word = *(__be32 *)iph ^ *(__be32 *)iph2 ; - /* All fields must match except length. */ + /* All fields must match except length and Traffic Class. */ if (nlen != skb_network_header_len(p) || - memcmp(iph, iph2, offsetof(struct ipv6hdr, payload_len)) || + (first_word & htonl(0xF00FFFFF)) || memcmp(&iph->nexthdr, &iph2->nexthdr, nlen - offsetof(struct ipv6hdr, nexthdr))) { NAPI_GRO_CB(p)->same_flow = 0; continue; } - + /* flush if Traffic Class fields are different */ + NAPI_GRO_CB(p)->flush |= !!(first_word & htonl(0x0FF00000)); NAPI_GRO_CB(p)->flush |= flush; } -- cgit v1.2.3 From ca07e43e288956a0ad5e6bd075f7aa1fca3bca00 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Oct 2012 22:28:06 +0000 Subject: net: gro: fix a potential crash in skb_gro_reset_offset Before accessing skb first fragment, better make sure there is one. This is probably not needed for old kernels, since an ethernet frame cannot contain only an ethernet header, but the recent GRO addition to tunnels makes this patch needed. Also skb_gro_reset_offset() can be static, it actually allows compiler to inline it. Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller --- net/core/dev.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1e0a1847c3bb..de2bad717d56 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3631,20 +3631,22 @@ gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb) } EXPORT_SYMBOL(napi_skb_finish); -void skb_gro_reset_offset(struct sk_buff *skb) +static void skb_gro_reset_offset(struct sk_buff *skb) { + const struct skb_shared_info *pinfo = skb_shinfo(skb); + const skb_frag_t *frag0 = &pinfo->frags[0]; + NAPI_GRO_CB(skb)->data_offset = 0; NAPI_GRO_CB(skb)->frag0 = NULL; NAPI_GRO_CB(skb)->frag0_len = 0; if (skb->mac_header == skb->tail && - !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) { - NAPI_GRO_CB(skb)->frag0 = - skb_frag_address(&skb_shinfo(skb)->frags[0]); - NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]); + pinfo->nr_frags && + !PageHighMem(skb_frag_page(frag0))) { + NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); + NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0); } } -EXPORT_SYMBOL(skb_gro_reset_offset); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { -- cgit v1.2.3 From cf7f601c067994f371ba77721d1e45fce61a4569 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 13 Sep 2012 13:06:29 +0100 Subject: KEYS: Add payload preparsing opportunity prior to key instantiate or update Give the key type the opportunity to preparse the payload prior to the instantiation and update routines being called. This is done with the provision of two new key type operations: int (*preparse)(struct key_preparsed_payload *prep); void (*free_preparse)(struct key_preparsed_payload *prep); If the first operation is present, then it is called before key creation (in the add/update case) or before the key semaphore is taken (in the update and instantiate cases). The second operation is called to clean up if the first was called. preparse() is given the opportunity to fill in the following structure: struct key_preparsed_payload { char *description; void *type_data[2]; void *payload; const void *data; size_t datalen; size_t quotalen; }; Before the preparser is called, the first three fields will have been cleared, the payload pointer and size will be stored in data and datalen and the default quota size from the key_type struct will be stored into quotalen. The preparser may parse the payload in any way it likes and may store data in the type_data[] and payload fields for use by the instantiate() and update() ops. The preparser may also propose a description for the key by attaching it as a string to the description field. This can be used by passing a NULL or "" description to the add_key() system call or the key_create_or_update() function. This cannot work with request_key() as that required the description to tell the upcall about the key to be created. This, for example permits keys that store PGP public keys to generate their own name from the user ID and public key fingerprint in the key. The instantiate() and update() operations are then modified to look like this: int (*instantiate)(struct key *key, struct key_preparsed_payload *prep); int (*update)(struct key *key, struct key_preparsed_payload *prep); and the new payload data is passed in *prep, whether or not it was preparsed. Signed-off-by: David Howells Signed-off-by: Rusty Russell --- net/ceph/crypto.c | 9 +++++---- net/dns_resolver/dns_key.c | 6 +++--- net/rxrpc/ar-key.c | 40 ++++++++++++++++++++-------------------- 3 files changed, 28 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 9da7fdd3cd8a..af14cb425164 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -423,14 +423,15 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, } } -int ceph_key_instantiate(struct key *key, const void *data, size_t datalen) +int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { struct ceph_crypto_key *ckey; + size_t datalen = prep->datalen; int ret; void *p; ret = -EINVAL; - if (datalen <= 0 || datalen > 32767 || !data) + if (datalen <= 0 || datalen > 32767 || !prep->data) goto err; ret = key_payload_reserve(key, datalen); @@ -443,8 +444,8 @@ int ceph_key_instantiate(struct key *key, const void *data, size_t datalen) goto err; /* TODO ceph_crypto_key_decode should really take const input */ - p = (void *)data; - ret = ceph_crypto_key_decode(ckey, &p, (char*)data+datalen); + p = (void *)prep->data; + ret = ceph_crypto_key_decode(ckey, &p, (char*)prep->data+datalen); if (ret < 0) goto err_ckey; diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index d9507dd05818..859ab8b6ec34 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -59,13 +59,13 @@ const struct cred *dns_resolver_cache; * "ip1,ip2,...#foo=bar" */ static int -dns_resolver_instantiate(struct key *key, const void *_data, size_t datalen) +dns_resolver_instantiate(struct key *key, struct key_preparsed_payload *prep) { struct user_key_payload *upayload; unsigned long derrno; int ret; - size_t result_len = 0; - const char *data = _data, *end, *opt; + size_t datalen = prep->datalen, result_len = 0; + const char *data = prep->data, *end, *opt; kenter("%%%d,%s,'%*.*s',%zu", key->serial, key->description, diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 8b1f9f49960f..106c5a6b1ab2 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -26,8 +26,8 @@ #include "ar-internal.h" static int rxrpc_vet_description_s(const char *); -static int rxrpc_instantiate(struct key *, const void *, size_t); -static int rxrpc_instantiate_s(struct key *, const void *, size_t); +static int rxrpc_instantiate(struct key *, struct key_preparsed_payload *); +static int rxrpc_instantiate_s(struct key *, struct key_preparsed_payload *); static void rxrpc_destroy(struct key *); static void rxrpc_destroy_s(struct key *); static void rxrpc_describe(const struct key *, struct seq_file *); @@ -678,7 +678,7 @@ error: * * if no data is provided, then a no-security key is made */ -static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) +static int rxrpc_instantiate(struct key *key, struct key_preparsed_payload *prep) { const struct rxrpc_key_data_v1 *v1; struct rxrpc_key_token *token, **pp; @@ -686,26 +686,26 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) u32 kver; int ret; - _enter("{%x},,%zu", key_serial(key), datalen); + _enter("{%x},,%zu", key_serial(key), prep->datalen); /* handle a no-security key */ - if (!data && datalen == 0) + if (!prep->data && prep->datalen == 0) return 0; /* determine if the XDR payload format is being used */ - if (datalen > 7 * 4) { - ret = rxrpc_instantiate_xdr(key, data, datalen); + if (prep->datalen > 7 * 4) { + ret = rxrpc_instantiate_xdr(key, prep->data, prep->datalen); if (ret != -EPROTO) return ret; } /* get the key interface version number */ ret = -EINVAL; - if (datalen <= 4 || !data) + if (prep->datalen <= 4 || !prep->data) goto error; - memcpy(&kver, data, sizeof(kver)); - data += sizeof(kver); - datalen -= sizeof(kver); + memcpy(&kver, prep->data, sizeof(kver)); + prep->data += sizeof(kver); + prep->datalen -= sizeof(kver); _debug("KEY I/F VERSION: %u", kver); @@ -715,11 +715,11 @@ static int rxrpc_instantiate(struct key *key, const void *data, size_t datalen) /* deal with a version 1 key */ ret = -EINVAL; - if (datalen < sizeof(*v1)) + if (prep->datalen < sizeof(*v1)) goto error; - v1 = data; - if (datalen != sizeof(*v1) + v1->ticket_length) + v1 = prep->data; + if (prep->datalen != sizeof(*v1) + v1->ticket_length) goto error; _debug("SCIX: %u", v1->security_index); @@ -784,17 +784,17 @@ error: * instantiate a server secret key * data should be a pointer to the 8-byte secret key */ -static int rxrpc_instantiate_s(struct key *key, const void *data, - size_t datalen) +static int rxrpc_instantiate_s(struct key *key, + struct key_preparsed_payload *prep) { struct crypto_blkcipher *ci; - _enter("{%x},,%zu", key_serial(key), datalen); + _enter("{%x},,%zu", key_serial(key), prep->datalen); - if (datalen != 8) + if (prep->datalen != 8) return -EINVAL; - memcpy(&key->type_data, data, 8); + memcpy(&key->type_data, prep->data, 8); ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(ci)) { @@ -802,7 +802,7 @@ static int rxrpc_instantiate_s(struct key *key, const void *data, return PTR_ERR(ci); } - if (crypto_blkcipher_setkey(ci, data, 8) < 0) + if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0) BUG(); key->payload.data = ci; -- cgit v1.2.3 From d851c12b60471188e15e5c8405b289073e8dd025 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sun, 7 Oct 2012 22:47:25 +0000 Subject: ipv4: Always invalidate or update the route on pmtu events Some protocols, like IPsec still cache routes. So we need to invalidate the old route on pmtu events to avoid the reuse of stale routes. We also need to update the mtu and expire time of the route if we already use a nh exception route, otherwise we ignore newly learned pmtu values after the first expiration. With this patch we always invalidate or update the route on pmtu events. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ff622069fcef..90ba8358a892 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -904,22 +904,29 @@ out: kfree_skb(skb); return 0; } -static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) +static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { + struct dst_entry *dst = &rt->dst; struct fib_result res; if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; + if (!rt->rt_pmtu) { + dst->obsolete = DST_OBSOLETE_KILL; + } else { + rt->rt_pmtu = mtu; + dst->expires = max(1UL, jiffies + ip_rt_mtu_expires); + } + rcu_read_lock(); - if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { + if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) { struct fib_nh *nh = &FIB_RES_NH(res); update_or_create_fnhe(nh, fl4->daddr, 0, mtu, jiffies + ip_rt_mtu_expires); } rcu_read_unlock(); - return mtu; } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, @@ -929,14 +936,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); - mtu = __ip_rt_update_pmtu(rt, &fl4, mtu); - - if (!rt->rt_pmtu) { - dst->obsolete = DST_OBSOLETE_KILL; - } else { - rt->rt_pmtu = mtu; - rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires); - } + __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, -- cgit v1.2.3 From 7f92d334ba19a0d8e96f8f8f092219553367d921 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Sun, 7 Oct 2012 22:48:18 +0000 Subject: ipv4: Don't create nh exeption when the device mtu is smaller than the reported pmtu When a local tool like tracepath tries to send packets bigger than the device mtu, we create a nh exeption and set the pmtu to device mtu. The device mtu does not expire, so check if the device mtu is smaller than the reported pmtu and don't crerate a nh exeption in that case. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 90ba8358a892..741df67a81ec 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -909,6 +909,9 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) struct dst_entry *dst = &rt->dst; struct fib_result res; + if (dst->dev->mtu < mtu) + return; + if (mtu < ip_rt_min_pmtu) mtu = ip_rt_min_pmtu; -- cgit v1.2.3 From ee9a8f7ab2edf801b8b514c310455c94acc232f6 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 8 Oct 2012 00:56:54 +0000 Subject: ipv4: Don't report stale pmtu values to userspace We report cached pmtu values even if they are already expired. Change this to not report these values after they are expired and fix a race in the expire time calculation, as suggested by Eric Dumazet. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/ipv4/route.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 741df67a81ec..132e0dfee53a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2187,8 +2187,18 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; + expires = rt->dst.expires; + if (expires) { + unsigned long now = jiffies; + + if (time_before(now, expires)) + expires -= now; + else + expires = 0; + } + memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); - if (rt->rt_pmtu) + if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; @@ -2198,13 +2208,6 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, goto nla_put_failure; error = rt->dst.error; - expires = rt->dst.expires; - if (expires) { - if (time_before(jiffies, expires)) - expires -= jiffies; - else - expires = 0; - } if (rt_is_input_route(rt)) { if (nla_put_u32(skb, RTA_IIF, rt->rt_iif)) -- cgit v1.2.3 From 2e71a6f8084e7ac87166dd77d99c44190fb844fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 6 Oct 2012 08:08:49 +0000 Subject: net: gro: selective flush of packets Current GRO can hold packets in gro_list for almost unlimited time, in case napi->poll() handler consumes its budget over and over. In this case, napi_complete()/napi_gro_flush() are not called. Another problem is that gro_list is flushed in non friendly way : We scan the list and complete packets in the reverse order. (youngest packets first, oldest packets last) This defeats priorities that sender could have cooked. Since GRO currently only store TCP packets, we dont really notice the bug because of retransmits, but this behavior can add unexpected latencies, particularly on mice flows clamped by elephant flows. This patch makes sure no packet can stay more than 1 ms in queue, and only in stress situations. It also complete packets in the right order to minimize latencies. Signed-off-by: Eric Dumazet Cc: Herbert Xu Cc: Jesse Gross Cc: Tom Herbert Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/core/dev.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index de2bad717d56..d44668f63c88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3471,17 +3471,31 @@ out: return netif_receive_skb(skb); } -inline void napi_gro_flush(struct napi_struct *napi) +/* napi->gro_list contains packets ordered by age. + * youngest packets at the head of it. + * Complete skbs in reverse order to reduce latencies. + */ +void napi_gro_flush(struct napi_struct *napi, bool flush_old) { - struct sk_buff *skb, *next; + struct sk_buff *skb, *prev = NULL; - for (skb = napi->gro_list; skb; skb = next) { - next = skb->next; + /* scan list and build reverse chain */ + for (skb = napi->gro_list; skb != NULL; skb = skb->next) { + skb->prev = prev; + prev = skb; + } + + for (skb = prev; skb; skb = prev) { skb->next = NULL; + + if (flush_old && NAPI_GRO_CB(skb)->age == jiffies) + return; + + prev = skb->prev; napi_gro_complete(skb); + napi->gro_count--; } - napi->gro_count = 0; napi->gro_list = NULL; } EXPORT_SYMBOL(napi_gro_flush); @@ -3542,6 +3556,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) napi->gro_count++; NAPI_GRO_CB(skb)->count = 1; + NAPI_GRO_CB(skb)->age = jiffies; skb_shinfo(skb)->gso_size = skb_gro_len(skb); skb->next = napi->gro_list; napi->gro_list = skb; @@ -3878,7 +3893,7 @@ void napi_complete(struct napi_struct *n) if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state))) return; - napi_gro_flush(n); + napi_gro_flush(n, false); local_irq_save(flags); __napi_complete(n); local_irq_restore(flags); @@ -3983,8 +3998,17 @@ static void net_rx_action(struct softirq_action *h) local_irq_enable(); napi_complete(n); local_irq_disable(); - } else + } else { + if (n->gro_list) { + /* flush too old packets + * If HZ < 1000, flush all packets. + */ + local_irq_enable(); + napi_gro_flush(n, HZ >= 1000); + local_irq_disable(); + } list_move_tail(&n->poll_list, &sd->poll_list); + } } netpoll_poll_unlock(have); -- cgit v1.2.3 From 55fabefe3695241e6ccfa0cd4974f3fa497693dc Mon Sep 17 00:00:00 2001 From: Thomas Pedersen Date: Fri, 5 Oct 2012 17:57:39 -0700 Subject: mac80211: call drv_get_tsf() in sleepable context The call to drv_get/set_tsf() was put on the workqueue to perform tsf adjustments since that function might sleep. However it ended up inside a spinlock, whose critical section must be atomic. Do tsf adjustment outside the spinlock instead, and get rid of a warning. Signed-off-by: Thomas Pedersen Signed-off-by: John W. Linville --- net/mac80211/mesh_sync.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_sync.c b/net/mac80211/mesh_sync.c index accfa00ffcdf..a16b7b4b1e02 100644 --- a/net/mac80211/mesh_sync.c +++ b/net/mac80211/mesh_sync.c @@ -56,7 +56,6 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) u64 tsfdelta; spin_lock_bh(&ifmsh->sync_offset_lock); - if (ifmsh->sync_offset_clockdrift_max < beacon_int_fraction) { msync_dbg(sdata, "TBTT : max clockdrift=%lld; adjusting\n", (long long) ifmsh->sync_offset_clockdrift_max); @@ -69,11 +68,11 @@ void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata) tsfdelta = -beacon_int_fraction; ifmsh->sync_offset_clockdrift_max -= beacon_int_fraction; } + spin_unlock_bh(&ifmsh->sync_offset_lock); tsf = drv_get_tsf(local, sdata); if (tsf != -1ULL) drv_set_tsf(local, sdata, tsf + tsfdelta); - spin_unlock_bh(&ifmsh->sync_offset_lock); } static void mesh_sync_offset_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From c3e7724b6bc2f25e46c38dbe68f09d71fafeafb8 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 8 Oct 2012 14:39:33 +0200 Subject: mac80211: use ieee80211_free_txskb to fix possible skb leaks A few places free skbs using dev_kfree_skb even though they're called after ieee80211_subif_start_xmit might have cloned it for tracking tx status. Use ieee80211_free_txskb here to prevent skb leaks. Signed-off-by: Felix Fietkau Cc: stable@vger.kernel.org Signed-off-by: John W. Linville --- net/mac80211/status.c | 4 ++-- net/mac80211/tx.c | 22 ++++++++++++---------- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 2ce89732d0f2..3af0cc4130f1 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -34,7 +34,7 @@ void ieee80211_tx_status_irqsafe(struct ieee80211_hw *hw, skb_queue_len(&local->skb_queue_unreliable); while (tmp > IEEE80211_IRQSAFE_QUEUE_LIMIT && (skb = skb_dequeue(&local->skb_queue_unreliable))) { - dev_kfree_skb_irq(skb); + ieee80211_free_txskb(hw, skb); tmp--; I802_DEBUG_INC(local->tx_status_drop); } @@ -159,7 +159,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", skb_queue_len(&sta->tx_filtered[ac]), !!test_sta_flag(sta, WLAN_STA_PS_STA), jiffies); - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); } static void ieee80211_check_pending_bar(struct sta_info *sta, u8 *addr, u8 tid) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e0e0d1d0e830..c9bf83f36657 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -354,7 +354,7 @@ static void purge_old_ps_buffers(struct ieee80211_local *local) total += skb_queue_len(&sta->ps_tx_buf[ac]); if (skb) { purged++; - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); break; } } @@ -466,7 +466,7 @@ ieee80211_tx_h_unicast_ps_buf(struct ieee80211_tx_data *tx) ps_dbg(tx->sdata, "STA %pM TX buffer for AC %d full - dropping oldest frame\n", sta->sta.addr, ac); - dev_kfree_skb(old); + ieee80211_free_txskb(&local->hw, old); } else tx->local->total_ps_buffered++; @@ -1103,7 +1103,7 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx, spin_unlock(&tx->sta->lock); if (purge_skb) - dev_kfree_skb(purge_skb); + ieee80211_free_txskb(&tx->local->hw, purge_skb); } /* reset session timer */ @@ -1214,7 +1214,7 @@ static bool ieee80211_tx_frags(struct ieee80211_local *local, #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (WARN_ON_ONCE(q >= local->hw.queues)) { __skb_unlink(skb, skbs); - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); continue; } #endif @@ -1356,7 +1356,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) if (unlikely(res == TX_DROP)) { I802_DEBUG_INC(tx->local->tx_handlers_drop); if (tx->skb) - dev_kfree_skb(tx->skb); + ieee80211_free_txskb(&tx->local->hw, tx->skb); else __skb_queue_purge(&tx->skbs); return -1; @@ -1393,7 +1393,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, res_prepare = ieee80211_tx_prepare(sdata, &tx, skb); if (unlikely(res_prepare == TX_DROP)) { - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); goto out; } else if (unlikely(res_prepare == TX_QUEUED)) { goto out; @@ -1465,7 +1465,7 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) headroom = max_t(int, 0, headroom); if (ieee80211_skb_resize(sdata, skb, headroom, may_encrypt)) { - dev_kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); rcu_read_unlock(); return; } @@ -2050,8 +2050,10 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, head_need += IEEE80211_ENCRYPT_HEADROOM; head_need += local->tx_headroom; head_need = max_t(int, 0, head_need); - if (ieee80211_skb_resize(sdata, skb, head_need, true)) - goto fail; + if (ieee80211_skb_resize(sdata, skb, head_need, true)) { + ieee80211_free_txskb(&local->hw, skb); + return NETDEV_TX_OK; + } } if (encaps_data) { @@ -2184,7 +2186,7 @@ void ieee80211_tx_pending(unsigned long data) struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); if (WARN_ON(!info->control.vif)) { - kfree_skb(skb); + ieee80211_free_txskb(&local->hw, skb); continue; } -- cgit v1.2.3 From 48cc32d38a52d0b68f91a171a8d00531edc6a46e Mon Sep 17 00:00:00 2001 From: Florian Zumbiehl Date: Sun, 7 Oct 2012 15:51:58 +0000 Subject: vlan: don't deliver frames for unknown vlans to protocols 6a32e4f9dd9219261f8856f817e6655114cfec2f made the vlan code skip marking vlan-tagged frames for not locally configured vlans as PACKET_OTHERHOST if there was an rx_handler, as the rx_handler could cause the frame to be received on a different (virtual) vlan-capable interface where that vlan might be configured. As rx_handlers do not necessarily return RX_HANDLER_ANOTHER, this could cause frames for unknown vlans to be delivered to the protocol stack as if they had been received untagged. For example, if an ipv6 router advertisement that's tagged for a locally not configured vlan is received on an interface with macvlan interfaces attached, macvlan's rx_handler returns RX_HANDLER_PASS after delivering the frame to the macvlan interfaces, which caused it to be passed to the protocol stack, leading to ipv6 addresses for the announced prefix being configured even though those are completely unusable on the underlying interface. The fix moves marking as PACKET_OTHERHOST after the rx_handler so the rx_handler, if there is one, sees the frame unchanged, but afterwards, before the frame is delivered to the protocol stack, it gets marked whether there is an rx_handler or not. Signed-off-by: Florian Zumbiehl Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 10 ++-------- net/core/dev.c | 7 +++++-- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index add69d0fd99d..fbbf1fa00940 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -5,7 +5,7 @@ #include #include "vlan.h" -bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) +bool vlan_do_receive(struct sk_buff **skbp) { struct sk_buff *skb = *skbp; u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; @@ -13,14 +13,8 @@ bool vlan_do_receive(struct sk_buff **skbp, bool last_handler) struct vlan_pcpu_stats *rx_stats; vlan_dev = vlan_find_dev(skb->dev, vlan_id); - if (!vlan_dev) { - /* Only the last call to vlan_do_receive() should change - * pkt_type to PACKET_OTHERHOST - */ - if (vlan_id && last_handler) - skb->pkt_type = PACKET_OTHERHOST; + if (!vlan_dev) return false; - } skb = *skbp = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) diff --git a/net/core/dev.c b/net/core/dev.c index d44668f63c88..09cb3f6dc40c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3300,18 +3300,18 @@ ncls: && !skb_pfmemalloc_protocol(skb)) goto drop; - rx_handler = rcu_dereference(skb->dev->rx_handler); if (vlan_tx_tag_present(skb)) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } - if (vlan_do_receive(&skb, !rx_handler)) + if (vlan_do_receive(&skb)) goto another_round; else if (unlikely(!skb)) goto unlock; } + rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { if (pt_prev) { ret = deliver_skb(skb, pt_prev, orig_dev); @@ -3331,6 +3331,9 @@ ncls: } } + if (vlan_tx_nonzero_tag_present(skb)) + skb->pkt_type = PACKET_OTHERHOST; + /* deliver only exact match when indicated */ null_or_dev = deliver_exact ? skb->dev : NULL; -- cgit v1.2.3 From 863472454ce50d4ef0929c6aa738cc5d64b84679 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 8 Oct 2012 21:38:50 +0200 Subject: ipv6: gro: fix PV6_GRO_CB(skb)->proto problem It seems IPV6_GRO_CB(skb)->proto can be destroyed in skb_gro_receive() if a new skb is allocated (to serve as an anchor for frag_list) We copy NAPI_GRO_CB() only (not the IPV6 specific part) in : *NAPI_GRO_CB(nskb) = *NAPI_GRO_CB(p); So we leave IPV6_GRO_CB(nskb)->proto to 0 (fresh skb allocation) instead of IPPROTO_TCP (6) ipv6_gro_complete() isnt able to call ops->gro_complete() [ tcp6_gro_complete() ] Fix this by moving proto in NAPI_GRO_CB() and getting rid of IPV6_GRO_CB Signed-off-by: Eric Dumazet Cc: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f757e3b7cfbf..a974247a9ae4 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -822,13 +822,6 @@ out: return segs; } -struct ipv6_gro_cb { - struct napi_gro_cb napi; - int proto; -}; - -#define IPV6_GRO_CB(skb) ((struct ipv6_gro_cb *)(skb)->cb) - static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, struct sk_buff *skb) { @@ -874,7 +867,7 @@ static struct sk_buff **ipv6_gro_receive(struct sk_buff **head, iph = ipv6_hdr(skb); } - IPV6_GRO_CB(skb)->proto = proto; + NAPI_GRO_CB(skb)->proto = proto; flush--; nlen = skb_network_header_len(skb); @@ -930,7 +923,7 @@ static int ipv6_gro_complete(struct sk_buff *skb) sizeof(*iph)); rcu_read_lock(); - ops = rcu_dereference(inet6_protos[IPV6_GRO_CB(skb)->proto]); + ops = rcu_dereference(inet6_protos[NAPI_GRO_CB(skb)->proto]); if (WARN_ON(!ops || !ops->gro_complete)) goto out_unlock; -- cgit v1.2.3 From e81da0e113a1b7fc7449ae6213f65f89ccac6d06 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:15 +0000 Subject: ipv4: fix sending of redirects After "Cache input routes in fib_info nexthops" (commit d2d68ba9fe) and "Elide fib_validate_source() completely when possible" (commit 7a9bc9b81a) we can not send ICMP redirects. It seems we should not cache the RTCF_DOREDIRECT flag in nh_rth_input because the same fib_info can be used for traffic that is not redirected, eg. from other input devices or from sources that are not in same subnet. As result, we have to disable the caching of RTCF_DOREDIRECT flag and to force source validation for the case when forwarding traffic to the input device. If traffic comes from directly connected source we allow redirection as it was done before both changes. Avoid setting RTCF_DOREDIRECT if IN_DEV_TX_REDIRECTS is disabled, this can avoid source address validation and to help caching the routes. After the change "Adjust semantics of rt->rt_gateway" (commit f8126f1d51) we should make sure our ICMP_REDIR_HOST messages contain daddr instead of 0.0.0.0 when target is directly connected. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 3 ++- net/ipv4/route.c | 30 ++++++++++++++++-------------- 2 files changed, 18 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 68c93d1bb03a..825c608826de 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -322,7 +322,8 @@ int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, { int r = secpath_exists(skb) ? 0 : IN_DEV_RPFILTER(idev); - if (!r && !fib_num_tclassid_users(dev_net(dev))) { + if (!r && !fib_num_tclassid_users(dev_net(dev)) && + (dev->ifindex != oif || !IN_DEV_TX_REDIRECTS(idev))) { *itag = 0; return 0; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 132e0dfee53a..b90da1bc2704 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -802,7 +802,8 @@ void ip_rt_send_redirect(struct sk_buff *skb) net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1); if (!peer) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, + rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } @@ -827,7 +828,9 @@ void ip_rt_send_redirect(struct sk_buff *skb) time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->rate_tokens)))) { - icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); + + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE @@ -835,7 +838,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) peer->rate_tokens == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), - &ip_hdr(skb)->daddr, &rt->rt_gateway); + &ip_hdr(skb)->daddr, &gw); #endif } out_put_peer: @@ -1442,10 +1445,13 @@ static int __mkroute_input(struct sk_buff *skb, goto cleanup; } - if (out_dev == in_dev && err && + do_cache = res->fi && !itag; + if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && (IN_DEV_SHARED_MEDIA(out_dev) || - inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) { flags |= RTCF_DOREDIRECT; + do_cache = false; + } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is @@ -1462,15 +1468,11 @@ static int __mkroute_input(struct sk_buff *skb, } } - do_cache = false; - if (res->fi) { - if (!itag) { - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); - if (rt_cache_valid(rth)) { - skb_dst_set_noref(skb, &rth->dst); - goto out; - } - do_cache = true; + if (do_cache) { + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); + if (rt_cache_valid(rth)) { + skb_dst_set_noref(skb, &rth->dst); + goto out; } } -- cgit v1.2.3 From e0adef0f7456d5d3a3bfe8ea61c7dddf146b40e1 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:16 +0000 Subject: ipv4: fix forwarding for strict source routes After the change "Adjust semantics of rt->rt_gateway" (commit f8126f1d51) rt_gateway can be 0 but ip_forward() compares it directly with nexthop. What we want here is to check if traffic is to directly connected nexthop and to fail if using gateway. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/ip_forward.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index ab09b126423c..7f35ac26a71a 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && opt->nexthop != rt->rt_gateway) + if (opt->is_strictroute && rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && -- cgit v1.2.3 From f8a17175c63fd3e8b573719f7538816f8c96abf4 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:17 +0000 Subject: ipv4: make sure nh_pcpu_rth_output is always allocated Avoid checking nh_pcpu_rth_output in fast path, abort fib_info creation on alloc_percpu failure. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/fib_semantics.c | 2 ++ net/ipv4/route.c | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 267753060ffc..71b125cd5db1 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -840,6 +840,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) change_nexthops(fi) { nexthop_nh->nh_parent = fi; nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *); + if (!nexthop_nh->nh_pcpu_rth_output) + goto failure; } endfor_nexthops(fi) if (cfg->fc_mx) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index b90da1bc2704..5b0180f11b20 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1207,8 +1207,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) if (rt_is_input_route(rt)) { p = (struct rtable **)&nh->nh_rth_input; } else { - if (!nh->nh_pcpu_rth_output) - goto nocache; p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output); } orig = *p; @@ -1223,7 +1221,6 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) * unsuccessful at storing this route into the cache * we really need to set it. */ -nocache: rt->dst.flags |= DST_NOCACHE; ret = false; } -- cgit v1.2.3 From 155e8336c373d14d87a7f91e356d85ef4b93b8f9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:18 +0000 Subject: ipv4: introduce rt_uses_gateway Add new flag to remember when route is via gateway. We will use it to allow rt_gateway to contain address of directly connected host for the cases when DST_NOCACHE is used or when the NH exception caches per-destination route without DST_NOCACHE flag, i.e. when routes are not used for other destinations. By this way we force the neighbour resolving to work with the routed destination but we can use different address in the packet, feature needed for IPVS-DR where original packet for virtual IP is routed via route to real IP. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_output.c | 4 ++-- net/ipv4/route.c | 48 ++++++++++++++++++++++------------------- net/ipv4/xfrm4_policy.c | 1 + 5 files changed, 32 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f0c5b9c1a957..d34ce2972c8f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -406,7 +406,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; return &rt->dst; @@ -442,7 +442,7 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk, rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) goto no_route; - if (opt && opt->opt.is_strictroute && rt->rt_gateway) + if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway) goto route_err; rcu_read_unlock(); return &rt->dst; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 7f35ac26a71a..694de3b7aebf 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -85,7 +85,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && rt->rt_gateway) + if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 24a29a39e9a8..6537a408a4fb 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -193,7 +193,7 @@ static inline int ip_finish_output2(struct sk_buff *skb) } rcu_read_lock_bh(); - nexthop = rt->rt_gateway ? rt->rt_gateway : ip_hdr(skb)->daddr; + nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); neigh = __ipv4_neigh_lookup_noref(dev, nexthop); if (unlikely(!neigh)) neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); @@ -371,7 +371,7 @@ int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl) skb_dst_set_noref(skb, &rt->dst); packet_routed: - if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_gateway) + if (inet_opt && inet_opt->opt.is_strictroute && rt->rt_uses_gateway) goto no_route; /* OK, we know where to send it, allocate and build IP header. */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5b0180f11b20..3a116cb0991a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1126,7 +1126,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) mtu = dst->dev->mtu; if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { - if (rt->rt_gateway && mtu > 576) + if (rt->rt_uses_gateway && mtu > 576) mtu = 576; } @@ -1177,7 +1177,9 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_gateway = fnhe->fnhe_gw; - } + rt->rt_uses_gateway = 1; + } else if (!rt->rt_gateway) + rt->rt_gateway = daddr; orig = rcu_dereference(fnhe->fnhe_rth); rcu_assign_pointer(fnhe->fnhe_rth, rt); @@ -1186,13 +1188,6 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, fnhe->fnhe_stamp = jiffies; ret = true; - } else { - /* Routes we intend to cache in nexthop exception have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; } spin_unlock_bh(&fnhe_lock); @@ -1215,15 +1210,8 @@ static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt) if (prev == orig) { if (orig) rt_free(orig); - } else { - /* Routes we intend to cache in the FIB nexthop have - * the DST_NOCACHE bit clear. However, if we are - * unsuccessful at storing this route into the cache - * we really need to set it. - */ - rt->dst.flags |= DST_NOCACHE; + } else ret = false; - } return ret; } @@ -1284,8 +1272,10 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, if (fi) { struct fib_nh *nh = &FIB_RES_NH(*res); - if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) + if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) { rt->rt_gateway = nh->nh_gw; + rt->rt_uses_gateway = 1; + } dst_init_metrics(&rt->dst, fi->fib_metrics, true); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; @@ -1294,8 +1284,18 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, cached = rt_bind_exception(rt, fnhe, daddr); else if (!(rt->dst.flags & DST_NOCACHE)) cached = rt_cache_route(nh, rt); - } - if (unlikely(!cached)) + if (unlikely(!cached)) { + /* Routes we intend to cache in nexthop exception or + * FIB nexthop have the DST_NOCACHE bit clear. + * However, if we are unsuccessful at storing this + * route into the cache we really need to set it. + */ + rt->dst.flags |= DST_NOCACHE; + if (!rt->rt_gateway) + rt->rt_gateway = daddr; + rt_add_uncached_list(rt); + } + } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID @@ -1363,6 +1363,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); if (our) { rth->dst.input= ip_local_deliver; @@ -1432,7 +1433,6 @@ static int __mkroute_input(struct sk_buff *skb, return -EINVAL; } - err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { @@ -1488,6 +1488,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); rth->dst.input = ip_forward; @@ -1658,6 +1659,7 @@ local_input: rth->rt_iif = 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; @@ -1826,6 +1828,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, rth->rt_iif = orig_oif ? : 0; rth->rt_pmtu = 0; rth->rt_gateway = 0; + rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); RT_CACHE_STAT_INC(out_slow_tot); @@ -2104,6 +2107,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_gateway = ort->rt_gateway; + rt->rt_uses_gateway = ort->rt_uses_gateway; INIT_LIST_HEAD(&rt->rt_uncached); @@ -2182,7 +2186,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } - if (rt->rt_gateway && + if (rt->rt_uses_gateway && nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway)) goto nla_put_failure; diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 681ea2f413e2..05c5ab8d983c 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -91,6 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, RTCF_LOCAL); xdst->u.rt.rt_type = rt->rt_type; xdst->u.rt.rt_gateway = rt->rt_gateway; + xdst->u.rt.rt_uses_gateway = rt->rt_uses_gateway; xdst->u.rt.rt_pmtu = rt->rt_pmtu; INIT_LIST_HEAD(&xdst->u.rt.rt_uncached); -- cgit v1.2.3 From c92b96553a80c1dbe2ebe128bbe37c8f98f148bf Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:19 +0000 Subject: ipv4: Add FLOWI_FLAG_KNOWN_NH Add flag to request that output route should be returned with known rt_gateway, in case we want to use it as nexthop for neighbour resolving. The returned route can be cached as follows: - in NH exception: because the cached routes are not shared with other destinations - in FIB NH: when using gateway because all destinations for NH share same gateway As last option, to return rt_gateway!=0 we have to set DST_NOCACHE. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/ipv4/route.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3a116cb0991a..1a0da8dc8180 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1762,6 +1762,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, struct in_device *in_dev; u16 type = res->type; struct rtable *rth; + bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) @@ -1798,24 +1799,36 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } fnhe = NULL; + do_cache = fi != NULL; if (fi) { struct rtable __rcu **prth; + struct fib_nh *nh = &FIB_RES_NH(*res); - fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); + fnhe = find_exception(nh, fl4->daddr); if (fnhe) prth = &fnhe->fnhe_rth; - else - prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output); + else { + if (unlikely(fl4->flowi4_flags & + FLOWI_FLAG_KNOWN_NH && + !(nh->nh_gw && + nh->nh_scope == RT_SCOPE_LINK))) { + do_cache = false; + goto add; + } + prth = __this_cpu_ptr(nh->nh_pcpu_rth_output); + } rth = rcu_dereference(*prth); if (rt_cache_valid(rth)) { dst_hold(&rth->dst); return rth; } } + +add: rth = rt_dst_alloc(dev_out, IN_DEV_CONF_GET(in_dev, NOPOLICY), IN_DEV_CONF_GET(in_dev, NOXFRM), - fi); + do_cache); if (!rth) return ERR_PTR(-ENOBUFS); -- cgit v1.2.3 From ad4d3ef8b7eb527cca478dc08c02c10936e64115 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 8 Oct 2012 11:41:20 +0000 Subject: ipvs: fix ARP resolving for direct routing mode After the change "Make neigh lookups directly in output packet path" (commit a263b30936) IPVS can not reach the real server for DR mode because we resolve the destination address from IP header, not from route neighbour. Use the new FLOWI_FLAG_KNOWN_NH flag to request output routes with known nexthop, so that it has preference on resolving. Signed-off-by: Julian Anastasov Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_xmit.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 56f6d5d81a77..cc4c8095681a 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -50,6 +50,7 @@ enum { * local */ IP_VS_RT_MODE_CONNECT = 8, /* Always bind route to saddr */ + IP_VS_RT_MODE_KNOWN_NH = 16,/* Route via remote addr */ }; /* @@ -113,6 +114,8 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, fl4.daddr = daddr; fl4.saddr = (rt_mode & IP_VS_RT_MODE_CONNECT) ? *saddr : 0; fl4.flowi4_tos = rtos; + fl4.flowi4_flags = (rt_mode & IP_VS_RT_MODE_KNOWN_NH) ? + FLOWI_FLAG_KNOWN_NH : 0; retry: rt = ip_route_output_key(net, &fl4); @@ -1061,7 +1064,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, RT_TOS(iph->tos), IP_VS_RT_MODE_LOCAL | - IP_VS_RT_MODE_NON_LOCAL, NULL))) + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_KNOWN_NH, NULL))) goto tx_error_icmp; if (rt->rt_flags & RTCF_LOCAL) { ip_rt_put(rt); -- cgit v1.2.3 From 4c199a93a2d36b277a9fd209a0f2793f8460a215 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Oct 2012 16:30:32 -0700 Subject: rbtree: empty nodes have no color Empty nodes have no color. We can make use of this property to simplify the code emitted by the RB_EMPTY_NODE and RB_CLEAR_NODE macros. Also, we can get rid of the rb_init_node function which had been introduced by commit 88d19cf37952 ("timers: Add rb_init_node() to allow for stack allocated rb nodes") to avoid some issue with the empty node's color not being initialized. I'm not sure what the RB_EMPTY_NODE checks in rb_prev() / rb_next() are doing there, though. axboe introduced them in commit 10fd48f2376d ("rbtree: fixed reversed RB_EMPTY_NODE and rb_next/prev"). The way I see it, the 'empty node' abstraction is only used by rbtree users to flag nodes that they haven't inserted in any rbtree, so asking the predecessor or successor of such nodes doesn't make any sense. One final rb_init_node() caller was recently added in sysctl code to implement faster sysctl name lookups. This code doesn't make use of RB_EMPTY_NODE at all, and from what I could see it only called rb_init_node() under the mistaken assumption that such initialization was required before node insertion. [sfr@canb.auug.org.au: fix net/ceph/osd_client.c build] Signed-off-by: Michel Lespinasse Cc: Andrea Arcangeli Acked-by: David Woodhouse Cc: Rik van Riel Cc: Peter Zijlstra Cc: Daniel Santos Cc: Jens Axboe Cc: "Eric W. Biederman" Cc: John Stultz Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/ceph/osd_client.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ccbdfbba9e53..c1d756cc7448 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -221,7 +221,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, kref_init(&req->r_kref); init_completion(&req->r_completion); init_completion(&req->r_safe_completion); - rb_init_node(&req->r_node); INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_linger_item); INIT_LIST_HEAD(&req->r_linger_osd); -- cgit v1.2.3 From 5175a5e76bbdf20a614fb47ce7a38f0f39e70226 Mon Sep 17 00:00:00 2001 From: "jeff.liu" Date: Mon, 8 Oct 2012 18:57:27 +0000 Subject: RDS: fix rds-ping spinlock recursion This is the revised patch for fixing rds-ping spinlock recursion according to Venkat's suggestions. RDS ping/pong over TCP feature has been broken for years(2.6.39 to 3.6.0) since we have to set TCP cork and call kernel_sendmsg() between ping/pong which both need to lock "struct sock *sk". However, this lock has already been hold before rds_tcp_data_ready() callback is triggerred. As a result, we always facing spinlock resursion which would resulting in system panic. Given that RDS ping is only used to test the connectivity and not for serious performance measurements, we can queue the pong transmit to rds_wq as a delayed response. Reported-by: Dan Carpenter CC: Venkat Venkatsubra CC: David S. Miller CC: James Morris Signed-off-by: Jie Liu Signed-off-by: David S. Miller --- net/rds/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 96531d4033a2..88eace57dd6b 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1122,7 +1122,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_pong); if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_xmit(conn); + queue_delayed_work(rds_wq, &conn->c_send_w, 0); rds_message_put(rm); return 0; -- cgit v1.2.3 From 5aa8b572007c4bca1e6d3dd4c4820f1ae49d6bb2 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:16 +0000 Subject: pktgen: fix crash when generating IPv6 packets For IPv6, sizeof(struct ipv6hdr) = 40, thus the following expression will result negative: datalen = pkt_dev->cur_pkt_size - 14 - sizeof(struct ipv6hdr) - sizeof(struct udphdr) - pkt_dev->pkt_overhead; And, the check "if (datalen < sizeof(struct pktgen_hdr))" will be passed as "datalen" is promoted to unsigned, therefore will cause a crash later. This is a quick fix by checking if "datalen" is negative. The following patch will increase the default value of 'min_pkt_size' for IPv6. This bug should exist for a long time, so Cc -stable too. Cc: Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 148e73d2c451..e356b8d52bad 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2927,7 +2927,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, sizeof(struct ipv6hdr) - sizeof(struct udphdr) - pkt_dev->pkt_overhead; - if (datalen < sizeof(struct pktgen_hdr)) { + if (datalen < 0 || datalen < sizeof(struct pktgen_hdr)) { datalen = sizeof(struct pktgen_hdr); net_info_ratelimited("increased datalen to %d\n", datalen); } -- cgit v1.2.3 From 68bf9f0b91ed4440951312cf7d4ffa70b9e8cf73 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:17 +0000 Subject: pktgen: set different default min_pkt_size for different protocols ETH_ZLEN is too small for IPv6, so this default value is not suitable. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index e356b8d52bad..98ee54963553 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -248,8 +248,8 @@ struct pktgen_dev { int removal_mark; /* non-zero => the device is marked for * removal by worker thread */ - int min_pkt_size; /* = ETH_ZLEN; */ - int max_pkt_size; /* = ETH_ZLEN; */ + int min_pkt_size; + int max_pkt_size; int pkt_overhead; /* overhead for MPLS, VLANs, IPSEC etc */ int nfrags; struct page *page; @@ -2036,10 +2036,14 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) /* Set up Dest MAC */ memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); - /* Set up pkt size */ - pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; - if (pkt_dev->flags & F_IPV6) { + if (pkt_dev->min_pkt_size == 0) { + pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr) + + sizeof(struct udphdr) + + sizeof(struct pktgen_hdr) + + pkt_dev->pkt_overhead; + } + /* * Skip this automatic address setting until locks or functions * gets exported @@ -2086,6 +2090,13 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) } #endif } else { + if (pkt_dev->min_pkt_size == 0) { + pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr) + + sizeof(struct udphdr) + + sizeof(struct pktgen_hdr) + + pkt_dev->pkt_overhead; + } + pkt_dev->saddr_min = 0; pkt_dev->saddr_max = 0; if (strlen(pkt_dev->src_min) == 0) { @@ -2111,6 +2122,10 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); } /* Initialize current values. */ + pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; + if (pkt_dev->min_pkt_size > pkt_dev->max_pkt_size) + pkt_dev->max_pkt_size = pkt_dev->min_pkt_size; + pkt_dev->cur_dst_mac_offset = 0; pkt_dev->cur_src_mac_offset = 0; pkt_dev->cur_saddr = pkt_dev->saddr_min; @@ -3548,8 +3563,6 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) } pkt_dev->removal_mark = 0; - pkt_dev->min_pkt_size = ETH_ZLEN; - pkt_dev->max_pkt_size = ETH_ZLEN; pkt_dev->nfrags = 0; pkt_dev->delay = pg_delay_d; pkt_dev->count = pg_count_d; -- cgit v1.2.3 From 0373a94671be1f5c823dbfc03617418d8effd5ce Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:18 +0000 Subject: pktgen: display IPv4 address in human-readable format It is weird to display IPv4 address in %x format, what's more, IPv6 address is disaplayed in human-readable format too. So, make it human-readable. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 98ee54963553..f9b4637e9f24 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -702,8 +702,8 @@ static int pktgen_if_show(struct seq_file *seq, void *v) &pkt_dev->cur_in6_saddr, &pkt_dev->cur_in6_daddr); } else - seq_printf(seq, " cur_saddr: 0x%x cur_daddr: 0x%x\n", - pkt_dev->cur_saddr, pkt_dev->cur_daddr); + seq_printf(seq, " cur_saddr: %pI4 cur_daddr: %pI4\n", + &pkt_dev->cur_saddr, &pkt_dev->cur_daddr); seq_printf(seq, " cur_udp_dst: %d cur_udp_src: %d\n", pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); -- cgit v1.2.3 From 4c139b8ccebaecdfad58eb068d61ef386f1a58ed Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:19 +0000 Subject: pktgen: enable automatic IPv6 address setting Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f9b4637e9f24..47fe18e6a8e4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2037,6 +2037,9 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, ETH_ALEN); if (pkt_dev->flags & F_IPV6) { + int i, set = 0, err = 1; + struct inet6_dev *idev; + if (pkt_dev->min_pkt_size == 0) { pkt_dev->min_pkt_size = 14 + sizeof(struct ipv6hdr) + sizeof(struct udphdr) @@ -2044,15 +2047,6 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) + pkt_dev->pkt_overhead; } - /* - * Skip this automatic address setting until locks or functions - * gets exported - */ - -#ifdef NOTNOW - int i, set = 0, err = 1; - struct inet6_dev *idev; - for (i = 0; i < IN6_ADDR_HSIZE; i++) if (pkt_dev->cur_in6_saddr.s6_addr[i]) { set = 1; @@ -2073,9 +2067,8 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) struct inet6_ifaddr *ifp; read_lock_bh(&idev->lock); - for (ifp = idev->addr_list; ifp; - ifp = ifp->if_next) { - if (ifp->scope == IFA_LINK && + list_for_each_entry(ifp, &idev->addr_list, if_list) { + if ((ifp->scope & IFA_LINK) && !(ifp->flags & IFA_F_TENTATIVE)) { pkt_dev->cur_in6_saddr = ifp->addr; err = 0; @@ -2088,7 +2081,6 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) if (err) pr_err("ERROR: IPv6 link address not available\n"); } -#endif } else { if (pkt_dev->min_pkt_size == 0) { pkt_dev->min_pkt_size = 14 + sizeof(struct iphdr) -- cgit v1.2.3 From c468fb1375f1b4de851e3e0dbe9d1293d414a160 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Tue, 9 Oct 2012 17:48:20 +0000 Subject: pktgen: replace scan_ip6() with in6_pton() Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/pktgen.c | 101 +++--------------------------------------------------- 1 file changed, 4 insertions(+), 97 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 47fe18e6a8e4..d1dc14c2aac4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -449,8 +449,6 @@ static void pktgen_stop_all_threads_ifs(void); static void pktgen_stop(struct pktgen_thread *t); static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); -static unsigned int scan_ip6(const char *s, char ip[16]); - /* Module parameters, defaults. */ static int pg_count_d __read_mostly = 1000; static int pg_delay_d __read_mostly; @@ -1299,7 +1297,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; - scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); + in6_pton(buf, -1, pkt_dev->in6_daddr.s6_addr, -1, NULL); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr); pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr; @@ -1322,7 +1320,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; - scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + in6_pton(buf, -1, pkt_dev->min_in6_daddr.s6_addr, -1, NULL); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr); pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr; @@ -1344,7 +1342,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; - scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + in6_pton(buf, -1, pkt_dev->max_in6_daddr.s6_addr, -1, NULL); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->max_in6_daddr); if (debug) @@ -1365,7 +1363,7 @@ static ssize_t pktgen_if_write(struct file *file, return -EFAULT; buf[len] = 0; - scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); + in6_pton(buf, -1, pkt_dev->in6_saddr.s6_addr, -1, NULL); snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr); pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr; @@ -2765,97 +2763,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, return skb; } -/* - * scan_ip6, fmt_ip taken from dietlibc-0.21 - * Author Felix von Leitner - * - * Slightly modified for kernel. - * Should be candidate for net/ipv4/utils.c - * --ro - */ - -static unsigned int scan_ip6(const char *s, char ip[16]) -{ - unsigned int i; - unsigned int len = 0; - unsigned long u; - char suffix[16]; - unsigned int prefixlen = 0; - unsigned int suffixlen = 0; - __be32 tmp; - char *pos; - - for (i = 0; i < 16; i++) - ip[i] = 0; - - for (;;) { - if (*s == ':') { - len++; - if (s[1] == ':') { /* Found "::", skip to part 2 */ - s += 2; - len++; - break; - } - s++; - } - - u = simple_strtoul(s, &pos, 16); - i = pos - s; - if (!i) - return 0; - if (prefixlen == 12 && s[i] == '.') { - - /* the last 4 bytes may be written as IPv4 address */ - - tmp = in_aton(s); - memcpy((struct in_addr *)(ip + 12), &tmp, sizeof(tmp)); - return i + len; - } - ip[prefixlen++] = (u >> 8); - ip[prefixlen++] = (u & 255); - s += i; - len += i; - if (prefixlen == 16) - return len; - } - -/* part 2, after "::" */ - for (;;) { - if (*s == ':') { - if (suffixlen == 0) - break; - s++; - len++; - } else if (suffixlen != 0) - break; - - u = simple_strtol(s, &pos, 16); - i = pos - s; - if (!i) { - if (*s) - len--; - break; - } - if (suffixlen + prefixlen <= 12 && s[i] == '.') { - tmp = in_aton(s); - memcpy((struct in_addr *)(suffix + suffixlen), &tmp, - sizeof(tmp)); - suffixlen += 4; - len += strlen(s); - break; - } - suffix[suffixlen++] = (u >> 8); - suffix[suffixlen++] = (u & 255); - s += i; - len += i; - if (prefixlen + suffixlen == 16) - break; - } - for (i = 0; i < suffixlen; i++) - ip[16 - suffixlen + i] = suffix[i]; - return len; -} - static struct sk_buff *fill_packet_ipv6(struct net_device *odev, struct pktgen_dev *pkt_dev) { -- cgit v1.2.3 From 6caab7b0544e83e6c160b5e80f5a4a7dd69545c7 Mon Sep 17 00:00:00 2001 From: Sarveshwar Bandi Date: Wed, 10 Oct 2012 01:15:01 +0000 Subject: bridge: Pull ip header into skb->data before looking into ip header. If lower layer driver leaves the ip header in the skb fragment, it needs to be first pulled into skb->data before inspecting ip header length or ip version number. Signed-off-by: Sarveshwar Bandi Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 68e8f364bbf8..fe43bc7b063f 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -265,6 +265,9 @@ static int br_parse_ip_options(struct sk_buff *skb) struct net_device *dev = skb->dev; u32 len; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + iph = ip_hdr(skb); opt = &(IPCB(skb)->opt); -- cgit v1.2.3 From 68aaed54e7682aef261d5c2cf99e85a9dbf33a84 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 10 Oct 2012 08:27:25 +0000 Subject: ipv4: fix route mark sparse warning Sparse complains about RTA_MARK which is should be host order according to include file and usage in iproute. net/ipv4/route.c:2223:46: warning: incorrect type in argument 3 (different base types) net/ipv4/route.c:2223:46: expected restricted __be32 [usertype] value net/ipv4/route.c:2223:46: got unsigned int [unsigned] [usertype] flowic_mark Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 1a0da8dc8180..432f4bb77238 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2220,7 +2220,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, goto nla_put_failure; if (fl4->flowi4_mark && - nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark)) + nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; error = rt->dst.error; -- cgit v1.2.3 From 759f42987f98915764bad922ee123acb0eadbe33 Mon Sep 17 00:00:00 2001 From: Simon Derr Date: Mon, 17 Sep 2012 15:16:31 +0200 Subject: 9P: Fix race between p9_write_work() and p9_fd_request() Race scenario: thread A thread B p9_write_work() p9_fd_request() if (list_empty (&m->unsent_req_list)) ... spin_lock(&client->lock); req->status = REQ_STATUS_UNSENT; list_add_tail(..., &m->unsent_req_list); spin_unlock(&client->lock); .... if (n & POLLOUT && !test_and_set_bit(Wworksched, &m->wsched) schedule_work(&m->wq); --> not done because Wworksched is set clear_bit(Wworksched, &m->wsched); return; --> nobody will take care of sending the new request. This is not very likely to happen though, because p9_write_work() being called with an empty unsent_req_list is not frequent. But this also means that taking the lock earlier will not be costly. Signed-off-by: Simon Derr Signed-off-by: Eric Van Hensbergen --- net/9p/trans_fd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index b2c308fffb8a..0031a8cf145d 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -453,12 +453,13 @@ static void p9_write_work(struct work_struct *work) } if (!m->wsize) { + spin_lock(&m->client->lock); if (list_empty(&m->unsent_req_list)) { clear_bit(Wworksched, &m->wsched); + spin_unlock(&m->client->lock); return; } - spin_lock(&m->client->lock); req = list_entry(m->unsent_req_list.next, struct p9_req_t, req_list); req->status = REQ_STATUS_SENT; -- cgit v1.2.3 From 0e24c4fc52b16f0a1102a933f636d2f350c41098 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 11 Oct 2012 06:24:14 +0000 Subject: tcp: sysctl interface leaks 16 bytes of kernel memory If the rc_dereference of tcp_fastopen_ctx ever fails then we copy 16 bytes of kernel stack into the proc result. Signed-off-by: Alan Cox Signed-off-by: David S. Miller --- net/ipv4/sysctl_net_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9205e492dc9d..63d4eccc674d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -248,6 +248,8 @@ int proc_tcp_fastopen_key(ctl_table *ctl, int write, void __user *buffer, ctxt = rcu_dereference(tcp_fastopen_ctx); if (ctxt) memcpy(user_key, ctxt->key, TCP_FASTOPEN_KEY_LENGTH); + else + memset(user_key, 0, sizeof(user_key)); rcu_read_unlock(); snprintf(tbl.data, tbl.maxlen, "%08x-%08x-%08x-%08x", -- cgit v1.2.3 From 4c67525849e0b7f4bd4fab2487ec9e43ea52ef29 Mon Sep 17 00:00:00 2001 From: Alexey Kuznetsov Date: Fri, 12 Oct 2012 04:34:17 +0000 Subject: tcp: resets are misrouted After commit e2446eaa ("tcp_v4_send_reset: binding oif to iif in no sock case").. tcp resets are always lost, when routing is asymmetric. Yes, backing out that patch will result in misrouting of resets for dead connections which used interface binding when were alive, but we actually cannot do anything here. What's died that's died and correct handling normal unbound connections is obviously a priority. Comment to comment: > This has few benefits: > 1. tcp_v6_send_reset already did that. It was done to route resets for IPv6 link local addresses. It was a mistake to do so for global addresses. The patch fixes this as well. Actually, the problem appears to be even more serious than guaranteed loss of resets. As reported by Sergey Soloviev , those misrouted resets create a lot of arp traffic and huge amount of unresolved arp entires putting down to knees NAT firewalls which use asymmetric routing. Signed-off-by: Alexey Kuznetsov --- net/ipv4/tcp_ipv4.c | 7 ++++--- net/ipv6/tcp_ipv6.c | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 75735c9a6a9d..ef998b008a57 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -708,10 +708,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; /* When socket is gone, all binding information is lost. - * routing might fail in this case. using iif for oif to - * make sure we can deliver it + * routing might fail in this case. No choice here, if we choose to force + * input interface, we will misroute in case of asymmetric route. */ - arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); + if (sk) + arg.bound_dev_if = sk->sk_bound_dev_if; net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 49c890386ce9..26175bffbaa0 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -877,7 +877,8 @@ static void tcp_v6_send_response(struct sk_buff *skb, u32 seq, u32 ack, u32 win, __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; - fl6.flowi6_oif = inet6_iif(skb); + if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL) + fl6.flowi6_oif = inet6_iif(skb); fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; security_skb_classify_flow(skb, flowi6_to_flowi(&fl6)); -- cgit v1.2.3 From 8437e7610c2d3e06f87f71fb82e10ed4b291812a Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 11 Oct 2012 12:51:28 +0000 Subject: vti: fix sparse bit endian warnings Use be32_to_cpu instead of htonl to keep sparse happy. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/ip_vti.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 978bca4818ae..1831092f999f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -374,7 +374,7 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) memset(&fl4, 0, sizeof(fl4)); flowi4_init_output(&fl4, tunnel->parms.link, - htonl(tunnel->parms.i_key), RT_TOS(tos), + be32_to_cpu(tunnel->parms.i_key), RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPIP, 0, dst, tiph->saddr, 0, 0); @@ -441,7 +441,7 @@ static int vti_tunnel_bind_dev(struct net_device *dev) struct flowi4 fl4; memset(&fl4, 0, sizeof(fl4)); flowi4_init_output(&fl4, tunnel->parms.link, - htonl(tunnel->parms.i_key), + be32_to_cpu(tunnel->parms.i_key), RT_TOS(iph->tos), RT_SCOPE_UNIVERSE, IPPROTO_IPIP, 0, iph->daddr, iph->saddr, 0, 0); -- cgit v1.2.3 From 28194fcdc150e4d5b418d01db3c29058f60ef32c Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 11 Oct 2012 21:06:16 +0000 Subject: net: add doc for in6_pton() It is not easy to use in6_pton() correctly without reading its definition, so add some doc for it. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/utils.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index f5613d569c23..30f3879faecc 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -161,6 +161,18 @@ out: } EXPORT_SYMBOL(in4_pton); +/** + * in6_pton - convert an IPv6 address from literal to binary representation + * @src: the start of the IPv6 address string + * @srclen: the length of the string, -1 means strlen(src) + * @dst: the binary (u8[16] array) representation of the IPv6 address + * @delim: the delimiter of the IPv6 address in @src, -1 means no delimiter + * @end: A pointer to the end of the parsed string will be placed here + * + * Return one on success, return zero when any error occurs + * and @end will point to the end of the parsed string. + * + */ int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end) -- cgit v1.2.3 From 93ac0ef016a1b223d23fbb5e0397cab75a8f7d34 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 11 Oct 2012 21:06:17 +0000 Subject: net: add doc for in4_pton() It is not easy to use in4_pton() correctly without reading its definition, so add some doc for it. Cc: David S. Miller Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- net/core/utils.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/core/utils.c b/net/core/utils.c index 30f3879faecc..e3487e461939 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -107,6 +107,18 @@ static inline int xdigit2bin(char c, int delim) return IN6PTON_UNKNOWN; } +/** + * in4_pton - convert an IPv4 address from literal to binary representation + * @src: the start of the IPv4 address string + * @srclen: the length of the string, -1 means strlen(src) + * @dst: the binary (u8[4] array) representation of the IPv4 address + * @delim: the delimiter of the IPv4 address in @src, -1 means no delimiter + * @end: A pointer to the end of the parsed string will be placed here + * + * Return one on success, return zero when any error occurs + * and @end will point to the end of the parsed string. + * + */ int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end) -- cgit v1.2.3 From 1bbb3095a5912be4b9c90397ef2182a5a328865b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 3 Oct 2012 20:32:17 -0700 Subject: userns: Properly print bluetooth socket uids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With user namespace support enabled building bluetooth generated the warning. net/bluetooth/af_bluetooth.c: In function ‘bt_seq_show’: net/bluetooth/af_bluetooth.c:598:7: warning: format ‘%u’ expects argument of type ‘unsigned int’, but argument 7 has type ‘kuid_t’ [-Wformat] Convert sock_i_uid from a kuid_t to a uid_t before printing, to avoid this problem. Reported-by: Fengguang Wu Cc: Masatake YAMATO Cc: Gustavo Padovan Signed-off-by: "Eric W. Biederman" --- net/bluetooth/af_bluetooth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 9d49ee6d7219..ba033f09196e 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -591,7 +591,7 @@ static int bt_seq_show(struct seq_file *seq, void *v) atomic_read(&sk->sk_refcnt), sk_rmem_alloc_get(sk), sk_wmem_alloc_get(sk), - sock_i_uid(sk), + from_kuid(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), &src_baswapped, &dst_baswapped, -- cgit v1.2.3 From 55462cf30ad9768fff6a6d36db21879146a39bdf Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Sun, 14 Oct 2012 04:30:56 +0000 Subject: vlan: fix bond/team enslave of vlan challenged slave/port In vlan_uses_dev() check for number of vlan devs rather than existence of vlan_info. The reason is that vlan id 0 is there without appropriate vlan dev on it by default which prevented from enslaving vlan challenged dev. Reported-by: Jon Stanley Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index fbbf1fa00940..65e06abe023f 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -366,6 +366,13 @@ EXPORT_SYMBOL(vlan_vids_del_by_dev); bool vlan_uses_dev(const struct net_device *dev) { - return rtnl_dereference(dev->vlan_info) ? true : false; + struct vlan_info *vlan_info; + + ASSERT_RTNL(); + + vlan_info = rtnl_dereference(dev->vlan_info); + if (!vlan_info) + return false; + return vlan_info->grp.nr_vlan_devs ? true : false; } EXPORT_SYMBOL(vlan_uses_dev); -- cgit v1.2.3 From f6e80abeab928b7c47cc1fbf53df13b4398a2bec Mon Sep 17 00:00:00 2001 From: Zijie Pan Date: Mon, 15 Oct 2012 03:56:39 +0000 Subject: sctp: fix call to SCTP_CMD_PROCESS_SACK in sctp_cmd_interpreter() Bug introduced by commit edfee0339e681a784ebacec7e8c2dc97dc6d2839 (sctp: check src addr when processing SACK to update transport state) Signed-off-by: Zijie Pan Signed-off-by: Nicolas Dichtel Acked-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/sm_sideeffect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 57f7de839b03..6773d7803627 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -1642,8 +1642,9 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, asoc->outqueue.outstanding_bytes; sackh.num_gap_ack_blocks = 0; sackh.num_dup_tsns = 0; + chunk->subh.sack_hdr = &sackh; sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, - SCTP_SACKH(&sackh)); + SCTP_CHUNK(chunk)); break; case SCTP_CMD_DISCARD_PACKET: -- cgit v1.2.3 From 9f0d3c2781baa1102108e16efbe640dd74564a7c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Oct 2012 07:37:27 +0000 Subject: ipv6: addrconf: fix /proc/net/if_inet6 Commit 1d5783030a1 (ipv6/addrconf: speedup /proc/net/if_inet6 filling) added bugs hiding some devices from if_inet6 and breaking applications. "ip -6 addr" could still display all IPv6 addresses, while "ifconfig -a" couldnt. One way to reproduce the bug is by starting in a shell : unshare -n /bin/bash ifconfig lo up And in original net namespace, lo device disappeared from if_inet6 Reported-by: Jan Hinnerk Stosch Tested-by: Jan Hinnerk Stosch Signed-off-by: Eric Dumazet Cc: Mihai Maruseac Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d7c56f8a5b4e..0424e4e27414 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3064,14 +3064,15 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) struct hlist_node *n; hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket], addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; /* sync with offset */ if (p < state->offset) { p++; continue; } state->offset++; - if (net_eq(dev_net(ifa->idev->dev), net)) - return ifa; + return ifa; } /* prepare for next bucket */ @@ -3089,18 +3090,20 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct hlist_node *n = &ifa->addr_lst; hlist_for_each_entry_continue_rcu_bh(ifa, n, addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; state->offset++; - if (net_eq(dev_net(ifa->idev->dev), net)) - return ifa; + return ifa; } while (++state->bucket < IN6_ADDR_HSIZE) { state->offset = 0; hlist_for_each_entry_rcu_bh(ifa, n, &inet6_addr_lst[state->bucket], addr_lst) { + if (!net_eq(dev_net(ifa->idev->dev), net)) + continue; state->offset++; - if (net_eq(dev_net(ifa->idev->dev), net)) - return ifa; + return ifa; } } -- cgit v1.2.3