diff options
Diffstat (limited to 'net')
-rw-r--r-- | net/9p/Kconfig | 6 | ||||
-rw-r--r-- | net/9p/Makefile | 4 | ||||
-rw-r--r-- | net/9p/client.c | 1 | ||||
-rw-r--r-- | net/9p/protocol.c | 33 | ||||
-rw-r--r-- | net/9p/trans_fd.c | 4 | ||||
-rw-r--r-- | net/9p/trans_rdma.c | 712 | ||||
-rw-r--r-- | net/bridge/br_device.c | 2 | ||||
-rw-r--r-- | net/bridge/br_if.c | 14 | ||||
-rw-r--r-- | net/core/dev.c | 135 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 25 | ||||
-rw-r--r-- | net/sched/sch_cbq.c | 7 | ||||
-rw-r--r-- | net/sctp/input.c | 2 | ||||
-rw-r--r-- | net/sctp/sm_statefuns.c | 54 | ||||
-rw-r--r-- | net/sctp/sm_statetable.c | 4 | ||||
-rw-r--r-- | net/unix/af_unix.c | 18 | ||||
-rw-r--r-- | net/wireless/Kconfig | 11 |
16 files changed, 921 insertions, 111 deletions
diff --git a/net/9p/Kconfig b/net/9p/Kconfig index ff34c5acc130..c42c0c400bf9 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -20,6 +20,12 @@ config NET_9P_VIRTIO This builds support for a transports between guest partitions and a host partition. +config NET_9P_RDMA + depends on NET_9P && INFINIBAND && EXPERIMENTAL + tristate "9P RDMA Transport (Experimental)" + help + This builds support for a RDMA transport. + config NET_9P_DEBUG bool "Debug information" depends on NET_9P diff --git a/net/9p/Makefile b/net/9p/Makefile index 1041b7bd12e2..198a640d53a6 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_NET_9P) := 9pnet.o obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o +obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o 9pnet-objs := \ mod.o \ @@ -11,3 +12,6 @@ obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o 9pnet_virtio-objs := \ trans_virtio.o \ + +9pnet_rdma-objs := \ + trans_rdma.o \ diff --git a/net/9p/client.c b/net/9p/client.c index bbac2f72b4d2..67717f69412e 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -159,6 +159,7 @@ static struct p9_req_t *p9_tag_alloc(struct p9_client *c, u16 tag) if (!c->reqs[row]) { printk(KERN_ERR "Couldn't grow tag array\n"); + spin_unlock_irqrestore(&c->lock, flags); return ERR_PTR(-ENOMEM); } for (col = 0; col < P9_ROW_MAXTAG; col++) { diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 29be52439086..dcd7666824ba 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -53,6 +53,7 @@ static int p9pdu_writef(struct p9_fcall *pdu, int optional, const char *fmt, ...); +#ifdef CONFIG_NET_9P_DEBUG void p9pdu_dump(int way, struct p9_fcall *pdu) { @@ -81,6 +82,12 @@ p9pdu_dump(int way, struct p9_fcall *pdu) else P9_DPRINTK(P9_DEBUG_PKT, "]]](%d) %s\n", datalen, buf); } +#else +void +p9pdu_dump(int way, struct p9_fcall *pdu) +{ +} +#endif EXPORT_SYMBOL(p9pdu_dump); void p9stat_free(struct p9_wstat *stbuf) @@ -179,7 +186,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } break; case 's':{ - char **ptr = va_arg(ap, char **); + char **sptr = va_arg(ap, char **); int16_t len; int size; @@ -189,17 +196,17 @@ p9pdu_vreadf(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) size = MAX(len, 0); - *ptr = kmalloc(size + 1, GFP_KERNEL); - if (*ptr == NULL) { + *sptr = kmalloc(size + 1, GFP_KERNEL); + if (*sptr == NULL) { errcode = -EFAULT; break; } - if (pdu_read(pdu, *ptr, size)) { + if (pdu_read(pdu, *sptr, size)) { errcode = -EFAULT; - kfree(*ptr); - *ptr = NULL; + kfree(*sptr); + *sptr = NULL; } else - (*ptr)[size] = 0; + (*sptr)[size] = 0; } break; case 'Q':{ @@ -373,13 +380,13 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) } break; case 's':{ - const char *ptr = va_arg(ap, const char *); + const char *sptr = va_arg(ap, const char *); int16_t len = 0; - if (ptr) - len = MIN(strlen(ptr), USHORT_MAX); + if (sptr) + len = MIN(strlen(sptr), USHORT_MAX); errcode = p9pdu_writef(pdu, optional, "w", len); - if (!errcode && pdu_write(pdu, ptr, len)) + if (!errcode && pdu_write(pdu, sptr, len)) errcode = -EFAULT; } break; @@ -419,7 +426,7 @@ p9pdu_vwritef(struct p9_fcall *pdu, int optional, const char *fmt, va_list ap) case 'U':{ int32_t count = va_arg(ap, int32_t); const char __user *udata = - va_arg(ap, const void *); + va_arg(ap, const void __user *); errcode = p9pdu_writef(pdu, optional, "d", count); if (!errcode && pdu_write_u(pdu, udata, count)) @@ -542,8 +549,10 @@ int p9pdu_finalize(struct p9_fcall *pdu) err = p9pdu_writef(pdu, 0, "d", size); pdu->size = size; +#ifdef CONFIG_NET_9P_DEBUG if ((p9_debug_level & P9_DEBUG_PKT) == P9_DEBUG_PKT) p9pdu_dump(0, pdu); +#endif P9_DPRINTK(P9_DEBUG_9P, ">>> size=%d type: %d tag: %d\n", pdu->size, pdu->id, pdu->tag); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index be65d8242fd2..1df0356f242b 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -678,11 +678,9 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) { - struct p9_trans_fd *ts = client->trans; - struct p9_conn *m = ts->conn; int ret = 1; - P9_DPRINTK(P9_DEBUG_TRANS, "mux %p req %p\n", m, req); + P9_DPRINTK(P9_DEBUG_TRANS, "client %p req %p\n", client, req); spin_lock(&client->lock); list_del(&req->req_list); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c new file mode 100644 index 000000000000..8d6cc4777aae --- /dev/null +++ b/net/9p/trans_rdma.c @@ -0,0 +1,712 @@ +/* + * linux/fs/9p/trans_rdma.c + * + * RDMA transport layer based on the trans_fd.c implementation. + * + * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com> + * Copyright (C) 2006 by Russ Cox <rsc@swtch.com> + * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> + * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> + * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to: + * Free Software Foundation + * 51 Franklin Street, Fifth Floor + * Boston, MA 02111-1301 USA + * + */ + +#include <linux/in.h> +#include <linux/module.h> +#include <linux/net.h> +#include <linux/ipv6.h> +#include <linux/kthread.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/un.h> +#include <linux/uaccess.h> +#include <linux/inet.h> +#include <linux/idr.h> +#include <linux/file.h> +#include <linux/parser.h> +#include <linux/semaphore.h> +#include <net/9p/9p.h> +#include <net/9p/client.h> +#include <net/9p/transport.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/ib_verbs.h> + +#define P9_PORT 5640 +#define P9_RDMA_SQ_DEPTH 32 +#define P9_RDMA_RQ_DEPTH 32 +#define P9_RDMA_SEND_SGE 4 +#define P9_RDMA_RECV_SGE 4 +#define P9_RDMA_IRD 0 +#define P9_RDMA_ORD 0 +#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ +#define P9_RDMA_MAXSIZE (4*4096) /* Min SGE is 4, so we can + * safely advertise a maxsize + * of 64k */ + +#define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT) +/** + * struct p9_trans_rdma - RDMA transport instance + * + * @state: tracks the transport state machine for connection setup and tear down + * @cm_id: The RDMA CM ID + * @pd: Protection Domain pointer + * @qp: Queue Pair pointer + * @cq: Completion Queue pointer + * @lkey: The local access only memory region key + * @timeout: Number of uSecs to wait for connection management events + * @sq_depth: The depth of the Send Queue + * @sq_sem: Semaphore for the SQ + * @rq_depth: The depth of the Receive Queue. + * @addr: The remote peer's address + * @req_lock: Protects the active request list + * @send_wait: Wait list when the SQ fills up + * @cm_done: Completion event for connection management tracking + */ +struct p9_trans_rdma { + enum { + P9_RDMA_INIT, + P9_RDMA_ADDR_RESOLVED, + P9_RDMA_ROUTE_RESOLVED, + P9_RDMA_CONNECTED, + P9_RDMA_FLUSHING, + P9_RDMA_CLOSING, + P9_RDMA_CLOSED, + } state; + struct rdma_cm_id *cm_id; + struct ib_pd *pd; + struct ib_qp *qp; + struct ib_cq *cq; + struct ib_mr *dma_mr; + u32 lkey; + long timeout; + int sq_depth; + struct semaphore sq_sem; + int rq_depth; + atomic_t rq_count; + struct sockaddr_in addr; + spinlock_t req_lock; + + struct completion cm_done; +}; + +/** + * p9_rdma_context - Keeps track of in-process WR + * + * @wc_op: The original WR op for when the CQE completes in error. + * @busa: Bus address to unmap when the WR completes + * @req: Keeps track of requests (send) + * @rc: Keepts track of replies (receive) + */ +struct p9_rdma_req; +struct p9_rdma_context { + enum ib_wc_opcode wc_op; + dma_addr_t busa; + union { + struct p9_req_t *req; + struct p9_fcall *rc; + }; +}; + +/** + * p9_rdma_opts - Collection of mount options + * @port: port of connection + * @sq_depth: The requested depth of the SQ. This really doesn't need + * to be any deeper than the number of threads used in the client + * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth + * @timeout: Time to wait in msecs for CM events + */ +struct p9_rdma_opts { + short port; + int sq_depth; + int rq_depth; + long timeout; +}; + +/* + * Option Parsing (code inspired by NFS code) + */ +enum { + /* Options that take integer arguments */ + Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err, +}; + +static match_table_t tokens = { + {Opt_port, "port=%u"}, + {Opt_sq_depth, "sq=%u"}, + {Opt_rq_depth, "rq=%u"}, + {Opt_timeout, "timeout=%u"}, + {Opt_err, NULL}, +}; + +/** + * parse_options - parse mount options into session structure + * @options: options string passed from mount + * @opts: transport-specific structure to parse options into + * + * Returns 0 upon success, -ERRNO upon failure + */ +static int parse_opts(char *params, struct p9_rdma_opts *opts) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + char *options; + int ret; + + opts->port = P9_PORT; + opts->sq_depth = P9_RDMA_SQ_DEPTH; + opts->rq_depth = P9_RDMA_RQ_DEPTH; + opts->timeout = P9_RDMA_TIMEOUT; + + if (!params) + return 0; + + options = kstrdup(params, GFP_KERNEL); + if (!options) { + P9_DPRINTK(P9_DEBUG_ERROR, + "failed to allocate copy of option string\n"); + return -ENOMEM; + } + + while ((p = strsep(&options, ",")) != NULL) { + int token; + int r; + if (!*p) + continue; + token = match_token(p, tokens, args); + r = match_int(&args[0], &option); + if (r < 0) { + P9_DPRINTK(P9_DEBUG_ERROR, + "integer field, but no integer?\n"); + ret = r; + continue; + } + switch (token) { + case Opt_port: + opts->port = option; + break; + case Opt_sq_depth: + opts->sq_depth = option; + break; + case Opt_rq_depth: + opts->rq_depth = option; + break; + case Opt_timeout: + opts->timeout = option; + break; + default: + continue; + } + } + /* RQ must be at least as large as the SQ */ + opts->rq_depth = max(opts->rq_depth, opts->sq_depth); + kfree(options); + return 0; +} + +static int +p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) +{ + struct p9_client *c = id->context; + struct p9_trans_rdma *rdma = c->trans; + switch (event->event) { + case RDMA_CM_EVENT_ADDR_RESOLVED: + BUG_ON(rdma->state != P9_RDMA_INIT); + rdma->state = P9_RDMA_ADDR_RESOLVED; + break; + + case RDMA_CM_EVENT_ROUTE_RESOLVED: + BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED); + rdma->state = P9_RDMA_ROUTE_RESOLVED; + break; + + case RDMA_CM_EVENT_ESTABLISHED: + BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED); + rdma->state = P9_RDMA_CONNECTED; + break; + + case RDMA_CM_EVENT_DISCONNECTED: + if (rdma) + rdma->state = P9_RDMA_CLOSED; + if (c) + c->status = Disconnected; + break; + + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + break; + + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_MULTICAST_ERROR: + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_CONNECT_REQUEST: + case RDMA_CM_EVENT_CONNECT_RESPONSE: + case RDMA_CM_EVENT_CONNECT_ERROR: + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_UNREACHABLE: + c->status = Disconnected; + rdma_disconnect(rdma->cm_id); + break; + default: + BUG(); + } + complete(&rdma->cm_done); + return 0; +} + +static void +handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, + struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +{ + struct p9_req_t *req; + int err = 0; + int16_t tag; + + req = NULL; + ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, + DMA_FROM_DEVICE); + + if (status != IB_WC_SUCCESS) + goto err_out; + + err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); + if (err) + goto err_out; + + req = p9_tag_lookup(client, tag); + if (!req) + goto err_out; + + req->rc = c->rc; + p9_client_cb(client, req); + + return; + + err_out: + P9_DPRINTK(P9_DEBUG_ERROR, "req %p err %d status %d\n", + req, err, status); + rdma->state = P9_RDMA_FLUSHING; + client->status = Disconnected; + return; +} + +static void +handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, + struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +{ + ib_dma_unmap_single(rdma->cm_id->device, + c->busa, c->req->tc->size, + DMA_TO_DEVICE); +} + +static void qp_event_handler(struct ib_event *event, void *context) +{ + P9_DPRINTK(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event, + context); +} + +static void cq_comp_handler(struct ib_cq *cq, void *cq_context) +{ + struct p9_client *client = cq_context; + struct p9_trans_rdma *rdma = client->trans; + int ret; + struct ib_wc wc; + + ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); + while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { + struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id; + + switch (c->wc_op) { + case IB_WC_RECV: + atomic_dec(&rdma->rq_count); + handle_recv(client, rdma, c, wc.status, wc.byte_len); + break; + + case IB_WC_SEND: + handle_send(client, rdma, c, wc.status, wc.byte_len); + up(&rdma->sq_sem); + break; + + default: + printk(KERN_ERR "9prdma: unexpected completion type, " + "c->wc_op=%d, wc.opcode=%d, status=%d\n", + c->wc_op, wc.opcode, wc.status); + break; + } + kfree(c); + } +} + +static void cq_event_handler(struct ib_event *e, void *v) +{ + P9_DPRINTK(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); +} + +static void rdma_destroy_trans(struct p9_trans_rdma *rdma) +{ + if (!rdma) + return; + + if (rdma->dma_mr && !IS_ERR(rdma->dma_mr)) + ib_dereg_mr(rdma->dma_mr); + + if (rdma->qp && !IS_ERR(rdma->qp)) + ib_destroy_qp(rdma->qp); + + if (rdma->pd && !IS_ERR(rdma->pd)) + ib_dealloc_pd(rdma->pd); + + if (rdma->cq && !IS_ERR(rdma->cq)) + ib_destroy_cq(rdma->cq); + + if (rdma->cm_id && !IS_ERR(rdma->cm_id)) + rdma_destroy_id(rdma->cm_id); + + kfree(rdma); +} + +static int +post_recv(struct p9_client *client, struct p9_rdma_context *c) +{ + struct p9_trans_rdma *rdma = client->trans; + struct ib_recv_wr wr, *bad_wr; + struct ib_sge sge; + + c->busa = ib_dma_map_single(rdma->cm_id->device, + c->rc->sdata, client->msize, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) + goto error; + + sge.addr = c->busa; + sge.length = client->msize; + sge.lkey = rdma->lkey; + + wr.next = NULL; + c->wc_op = IB_WC_RECV; + wr.wr_id = (unsigned long) c; + wr.sg_list = &sge; + wr.num_sge = 1; + return ib_post_recv(rdma->qp, &wr, &bad_wr); + + error: + P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); + return -EIO; +} + +static int rdma_request(struct p9_client *client, struct p9_req_t *req) +{ + struct p9_trans_rdma *rdma = client->trans; + struct ib_send_wr wr, *bad_wr; + struct ib_sge sge; + int err = 0; + unsigned long flags; + struct p9_rdma_context *c = NULL; + struct p9_rdma_context *rpl_context = NULL; + + /* Allocate an fcall for the reply */ + rpl_context = kmalloc(sizeof *rpl_context, GFP_KERNEL); + if (!rpl_context) + goto err_close; + + /* + * If the request has a buffer, steal it, otherwise + * allocate a new one. Typically, requests should already + * have receive buffers allocated and just swap them around + */ + if (!req->rc) { + req->rc = kmalloc(sizeof(struct p9_fcall)+client->msize, + GFP_KERNEL); + if (req->rc) { + req->rc->sdata = (char *) req->rc + + sizeof(struct p9_fcall); + req->rc->capacity = client->msize; + } + } + rpl_context->rc = req->rc; + if (!rpl_context->rc) { + kfree(rpl_context); + goto err_close; + } + + /* + * Post a receive buffer for this request. We need to ensure + * there is a reply buffer available for every outstanding + * request. A flushed request can result in no reply for an + * outstanding request, so we must keep a count to avoid + * overflowing the RQ. + */ + if (atomic_inc_return(&rdma->rq_count) <= rdma->rq_depth) { + err = post_recv(client, rpl_context); + if (err) { + kfree(rpl_context->rc); + kfree(rpl_context); + goto err_close; + } + } else + atomic_dec(&rdma->rq_count); + + /* remove posted receive buffer from request structure */ + req->rc = NULL; + + /* Post the request */ + c = kmalloc(sizeof *c, GFP_KERNEL); + if (!c) + goto err_close; + c->req = req; + + c->busa = ib_dma_map_single(rdma->cm_id->device, + c->req->tc->sdata, c->req->tc->size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) + goto error; + + sge.addr = c->busa; + sge.length = c->req->tc->size; + sge.lkey = rdma->lkey; + + wr.next = NULL; + c->wc_op = IB_WC_SEND; + wr.wr_id = (unsigned long) c; + wr.opcode = IB_WR_SEND; + wr.send_flags = IB_SEND_SIGNALED; + wr.sg_list = &sge; + wr.num_sge = 1; + + if (down_interruptible(&rdma->sq_sem)) + goto error; + + return ib_post_send(rdma->qp, &wr, &bad_wr); + + error: + P9_DPRINTK(P9_DEBUG_ERROR, "EIO\n"); + return -EIO; + + err_close: + spin_lock_irqsave(&rdma->req_lock, flags); + if (rdma->state < P9_RDMA_CLOSING) { + rdma->state = P9_RDMA_CLOSING; + spin_unlock_irqrestore(&rdma->req_lock, flags); + rdma_disconnect(rdma->cm_id); + } else + spin_unlock_irqrestore(&rdma->req_lock, flags); + return err; +} + +static void rdma_close(struct p9_client *client) +{ + struct p9_trans_rdma *rdma; + + if (!client) + return; + + rdma = client->trans; + if (!rdma) + return; + + client->status = Disconnected; + rdma_disconnect(rdma->cm_id); + rdma_destroy_trans(rdma); +} + +/** + * alloc_rdma - Allocate and initialize the rdma transport structure + * @msize: MTU + * @dotu: Extension attribute + * @opts: Mount options structure + */ +static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) +{ + struct p9_trans_rdma *rdma; + + rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); + if (!rdma) + return NULL; + + rdma->sq_depth = opts->sq_depth; + rdma->rq_depth = opts->rq_depth; + rdma->timeout = opts->timeout; + spin_lock_init(&rdma->req_lock); + init_completion(&rdma->cm_done); + sema_init(&rdma->sq_sem, rdma->sq_depth); + atomic_set(&rdma->rq_count, 0); + + return rdma; +} + +/* its not clear to me we can do anything after send has been posted */ +static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) +{ + return 1; +} + +/** + * trans_create_rdma - Transport method for creating atransport instance + * @client: client instance + * @addr: IP address string + * @args: Mount options string + */ +static int +rdma_create_trans(struct p9_client *client, const char *addr, char *args) +{ + int err; + struct p9_rdma_opts opts; + struct p9_trans_rdma *rdma; + struct rdma_conn_param conn_param; + struct ib_qp_init_attr qp_attr; + struct ib_device_attr devattr; + + /* Parse the transport specific mount options */ + err = parse_opts(args, &opts); + if (err < 0) + return err; + + /* Create and initialize the RDMA transport structure */ + rdma = alloc_rdma(&opts); + if (!rdma) + return -ENOMEM; + + /* Create the RDMA CM ID */ + rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP); + if (IS_ERR(rdma->cm_id)) + goto error; + + /* Resolve the server's address */ + rdma->addr.sin_family = AF_INET; + rdma->addr.sin_addr.s_addr = in_aton(addr); + rdma->addr.sin_port = htons(opts.port); + err = rdma_resolve_addr(rdma->cm_id, NULL, + (struct sockaddr *)&rdma->addr, + rdma->timeout); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED)) + goto error; + + /* Resolve the route to the server */ + err = rdma_resolve_route(rdma->cm_id, rdma->timeout); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) + goto error; + + /* Query the device attributes */ + err = ib_query_device(rdma->cm_id->device, &devattr); + if (err) + goto error; + + /* Create the Completion Queue */ + rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, + cq_event_handler, client, + opts.sq_depth + opts.rq_depth + 1, 0); + if (IS_ERR(rdma->cq)) + goto error; + ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); + + /* Create the Protection Domain */ + rdma->pd = ib_alloc_pd(rdma->cm_id->device); + if (IS_ERR(rdma->pd)) + goto error; + + /* Cache the DMA lkey in the transport */ + rdma->dma_mr = NULL; + if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + rdma->lkey = rdma->cm_id->device->local_dma_lkey; + else { + rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rdma->dma_mr)) + goto error; + rdma->lkey = rdma->dma_mr->lkey; + } + + /* Create the Queue Pair */ + memset(&qp_attr, 0, sizeof qp_attr); + qp_attr.event_handler = qp_event_handler; + qp_attr.qp_context = client; + qp_attr.cap.max_send_wr = opts.sq_depth; + qp_attr.cap.max_recv_wr = opts.rq_depth; + qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE; + qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = rdma->cq; + qp_attr.recv_cq = rdma->cq; + err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); + if (err) + goto error; + rdma->qp = rdma->cm_id->qp; + + /* Request a connection */ + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.private_data = NULL; + conn_param.private_data_len = 0; + conn_param.responder_resources = P9_RDMA_IRD; + conn_param.initiator_depth = P9_RDMA_ORD; + err = rdma_connect(rdma->cm_id, &conn_param); + if (err) + goto error; + err = wait_for_completion_interruptible(&rdma->cm_done); + if (err || (rdma->state != P9_RDMA_CONNECTED)) + goto error; + + client->trans = rdma; + client->status = Connected; + + return 0; + +error: + rdma_destroy_trans(rdma); + return -ENOTCONN; +} + +static struct p9_trans_module p9_rdma_trans = { + .name = "rdma", + .maxsize = P9_RDMA_MAXSIZE, + .def = 0, + .owner = THIS_MODULE, + .create = rdma_create_trans, + .close = rdma_close, + .request = rdma_request, + .cancel = rdma_cancel, +}; + +/** + * p9_trans_rdma_init - Register the 9P RDMA transport driver + */ +static int __init p9_trans_rdma_init(void) +{ + v9fs_register_trans(&p9_rdma_trans); + return 0; +} + +static void __exit p9_trans_rdma_exit(void) +{ + v9fs_unregister_trans(&p9_rdma_trans); +} + +module_init(p9_trans_rdma_init); +module_exit(p9_trans_rdma_exit); + +MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); +MODULE_DESCRIPTION("RDMA Transport for 9P"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 22ba8632196f..6c023f0f8252 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -179,5 +179,5 @@ void br_dev_setup(struct net_device *dev) dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX | - NETIF_F_NETNS_LOCAL; + NETIF_F_NETNS_LOCAL | NETIF_F_GSO; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 573e20f7dba4..0a09ccf68c1c 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -347,15 +347,21 @@ int br_min_mtu(const struct net_bridge *br) void br_features_recompute(struct net_bridge *br) { struct net_bridge_port *p; - unsigned long features; + unsigned long features, mask; - features = br->feature_mask; + features = mask = br->feature_mask; + if (list_empty(&br->port_list)) + goto done; + + features &= ~NETIF_F_ONE_FOR_ALL; list_for_each_entry(p, &br->port_list, list) { - features = netdev_compute_features(features, p->dev->features); + features = netdev_increment_features(features, + p->dev->features, mask); } - br->dev->features = features; +done: + br->dev->features = netdev_fix_features(features, NULL); } /* called with RTNL */ diff --git a/net/core/dev.c b/net/core/dev.c index b8a4fd0806af..d9038e328cc1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3947,6 +3947,46 @@ static void netdev_init_queue_locks(struct net_device *dev) __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); } +unsigned long netdev_fix_features(unsigned long features, const char *name) +{ + /* Fix illegal SG+CSUM combinations. */ + if ((features & NETIF_F_SG) && + !(features & NETIF_F_ALL_CSUM)) { + if (name) + printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no " + "checksum feature.\n", name); + features &= ~NETIF_F_SG; + } + + /* TSO requires that SG is present as well. */ + if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) { + if (name) + printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no " + "SG feature.\n", name); + features &= ~NETIF_F_TSO; + } + + if (features & NETIF_F_UFO) { + if (!(features & NETIF_F_GEN_CSUM)) { + if (name) + printk(KERN_ERR "%s: Dropping NETIF_F_UFO " + "since no NETIF_F_HW_CSUM feature.\n", + name); + features &= ~NETIF_F_UFO; + } + + if (!(features & NETIF_F_SG)) { + if (name) + printk(KERN_ERR "%s: Dropping NETIF_F_UFO " + "since no NETIF_F_SG feature.\n", name); + features &= ~NETIF_F_UFO; + } + } + + return features; +} +EXPORT_SYMBOL(netdev_fix_features); + /** * register_netdevice - register a network device * @dev: device to register @@ -4032,36 +4072,7 @@ int register_netdevice(struct net_device *dev) dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM); } - - /* Fix illegal SG+CSUM combinations. */ - if ((dev->features & NETIF_F_SG) && - !(dev->features & NETIF_F_ALL_CSUM)) { - printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no checksum feature.\n", - dev->name); - dev->features &= ~NETIF_F_SG; - } - - /* TSO requires that SG is present as well. */ - if ((dev->features & NETIF_F_TSO) && - !(dev->features & NETIF_F_SG)) { - printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no SG feature.\n", - dev->name); - dev->features &= ~NETIF_F_TSO; - } - if (dev->features & NETIF_F_UFO) { - if (!(dev->features & NETIF_F_HW_CSUM)) { - printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " - "NETIF_F_HW_CSUM feature.\n", - dev->name); - dev->features &= ~NETIF_F_UFO; - } - if (!(dev->features & NETIF_F_SG)) { - printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no " - "NETIF_F_SG feature.\n", - dev->name); - dev->features &= ~NETIF_F_UFO; - } - } + dev->features = netdev_fix_features(dev->features, dev->name); /* Enable software GSO if SG is supported. */ if (dev->features & NETIF_F_SG) @@ -4700,49 +4711,45 @@ static int __init netdev_dma_register(void) { return -ENODEV; } #endif /* CONFIG_NET_DMA */ /** - * netdev_compute_feature - compute conjunction of two feature sets - * @all: first feature set - * @one: second feature set + * netdev_increment_features - increment feature set by one + * @all: current feature set + * @one: new feature set + * @mask: mask feature set * * Computes a new feature set after adding a device with feature set - * @one to the master device with current feature set @all. Returns - * the new feature set. + * @one to the master device with current feature set @all. Will not + * enable anything that is off in @mask. Returns the new feature set. */ -int netdev_compute_features(unsigned long all, unsigned long one) -{ - /* if device needs checksumming, downgrade to hw checksumming */ - if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) - all ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM; - - /* if device can't do all checksum, downgrade to ipv4/ipv6 */ - if (all & NETIF_F_HW_CSUM && !(one & NETIF_F_HW_CSUM)) - all ^= NETIF_F_HW_CSUM - | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; - - if (one & NETIF_F_GSO) - one |= NETIF_F_GSO_SOFTWARE; - one |= NETIF_F_GSO; - - /* - * If even one device supports a GSO protocol with software fallback, - * enable it for all. - */ - all |= one & NETIF_F_GSO_SOFTWARE; +unsigned long netdev_increment_features(unsigned long all, unsigned long one, + unsigned long mask) +{ + /* If device needs checksumming, downgrade to it. */ + if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM)) + all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM); + else if (mask & NETIF_F_ALL_CSUM) { + /* If one device supports v4/v6 checksumming, set for all. */ + if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) && + !(all & NETIF_F_GEN_CSUM)) { + all &= ~NETIF_F_ALL_CSUM; + all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); + } - /* If even one device supports robust GSO, enable it for all. */ - if (one & NETIF_F_GSO_ROBUST) - all |= NETIF_F_GSO_ROBUST; + /* If one device supports hw checksumming, set for all. */ + if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) { + all &= ~NETIF_F_ALL_CSUM; + all |= NETIF_F_HW_CSUM; + } + } - all &= one | NETIF_F_LLTX; + one |= NETIF_F_ALL_CSUM; - if (!(all & NETIF_F_ALL_CSUM)) - all &= ~NETIF_F_SG; - if (!(all & NETIF_F_SG)) - all &= ~NETIF_F_GSO_MASK; + one |= all & NETIF_F_ONE_FOR_ALL; + all &= one | NETIF_F_LLTX | NETIF_F_GSO; + all |= one & mask & NETIF_F_ONE_FOR_ALL; return all; } -EXPORT_SYMBOL(netdev_compute_features); +EXPORT_SYMBOL(netdev_increment_features); static struct hlist_head *netdev_create_hash(void) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 990a58493235..e4c5ac9fe89b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -362,6 +362,17 @@ struct tcp_out_options { __u32 tsval, tsecr; /* need to include OPTION_TS */ }; +/* Beware: Something in the Internet is very sensitive to the ordering of + * TCP options, we learned this through the hard way, so be careful here. + * Luckily we can at least blame others for their non-compliance but from + * inter-operatibility perspective it seems that we're somewhat stuck with + * the ordering which we have been using if we want to keep working with + * those broken things (not that it currently hurts anybody as there isn't + * particular reason why the ordering would need to be changed). + * + * At least SACK_PERM as the first option is known to lead to a disaster + * (but it may well be that other scenarios fail similarly). + */ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, const struct tcp_out_options *opts, __u8 **md5_hash) { @@ -376,6 +387,12 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *md5_hash = NULL; } + if (unlikely(opts->mss)) { + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + } + if (likely(OPTION_TS & opts->options)) { if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | @@ -392,12 +409,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - if (unlikely(opts->mss)) { - *ptr++ = htonl((TCPOPT_MSS << 24) | - (TCPOLEN_MSS << 16) | - opts->mss); - } - if (unlikely(OPTION_SACK_ADVERTISE & opts->options && !(OPTION_TS & opts->options))) { *ptr++ = htonl((TCPOPT_NOP << 24) | @@ -432,7 +443,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks--; + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; } } } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 8b06fa900482..03e389e8d945 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -545,9 +545,10 @@ static void cbq_ovl_delay(struct cbq_class *cl) expires = ktime_set(0, 0); expires = ktime_add_ns(expires, PSCHED_US2NS(sched)); if (hrtimer_try_to_cancel(&q->delay_timer) && - ktime_to_ns(ktime_sub(q->delay_timer.expires, - expires)) > 0) - q->delay_timer.expires = expires; + ktime_to_ns(ktime_sub( + hrtimer_get_expires(&q->delay_timer), + expires)) > 0) + hrtimer_set_expires(&q->delay_timer, expires); hrtimer_restart(&q->delay_timer); cl->delayed = 1; cl->xstats.overactions++; diff --git a/net/sctp/input.c b/net/sctp/input.c index a49fa80b57b9..bf612d954d41 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -369,7 +369,7 @@ static void sctp_add_backlog(struct sock *sk, struct sk_buff *skb) void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { - if (!t || (t->pathmtu == pmtu)) + if (!t || (t->pathmtu <= pmtu)) return; if (sock_owned_by_user(sk)) { diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d4c3fbc4671e..a6a0ea71ae93 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -2544,6 +2544,7 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown(const struct sctp_endpoint *ep, sctp_shutdownhdr_t *sdh; sctp_disposition_t disposition; struct sctp_ulpevent *ev; + __u32 ctsn; if (!sctp_vtag_verify(chunk, asoc)) return sctp_sf_pdiscard(ep, asoc, type, arg, commands); @@ -2558,6 +2559,14 @@ sctp_disposition_t sctp_sf_do_9_2_shutdown(const struct sctp_endpoint *ep, sdh = (sctp_shutdownhdr_t *)chunk->skb->data; skb_pull(chunk->skb, sizeof(sctp_shutdownhdr_t)); chunk->subh.shutdown_hdr = sdh; + ctsn = ntohl(sdh->cum_tsn_ack); + + /* If Cumulative TSN Ack beyond the max tsn currently + * send, terminating the association and respond to the + * sender with an ABORT. + */ + if (!TSN_lt(ctsn, asoc->next_tsn)) + return sctp_sf_violation_ctsn(ep, asoc, type, arg, commands); /* API 5.3.1.5 SCTP_SHUTDOWN_EVENT * When a peer sends a SHUTDOWN, SCTP delivers this notification to @@ -2599,6 +2608,51 @@ out: return disposition; } +/* + * sctp_sf_do_9_2_shut_ctsn + * + * Once an endpoint has reached the SHUTDOWN-RECEIVED state, + * it MUST NOT send a SHUTDOWN in response to a ULP request. + * The Cumulative TSN Ack of the received SHUTDOWN chunk + * MUST be processed. + */ +sctp_disposition_t sctp_sf_do_9_2_shut_ctsn(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_shutdownhdr_t *sdh; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, + sizeof(struct sctp_shutdown_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + sdh = (sctp_shutdownhdr_t *)chunk->skb->data; + + /* If Cumulative TSN Ack beyond the max tsn currently + * send, terminating the association and respond to the + * sender with an ABORT. + */ + if (!TSN_lt(ntohl(sdh->cum_tsn_ack), asoc->next_tsn)) + return sctp_sf_violation_ctsn(ep, asoc, type, arg, commands); + + /* verify, by checking the Cumulative TSN Ack field of the + * chunk, that all its outstanding DATA chunks have been + * received by the SHUTDOWN sender. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN, + SCTP_BE32(sdh->cum_tsn_ack)); + + return SCTP_DISPOSITION_CONSUME; +} + /* RFC 2960 9.2 * If an endpoint is in SHUTDOWN-ACK-SENT state and receives an INIT chunk * (e.g., if the SHUTDOWN COMPLETE was lost) with source and destination diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index dd4ddc40c0ad..5c8186d88c61 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -266,11 +266,11 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, /* SCTP_STATE_ESTABLISHED */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_PENDING */ \ - TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown), \ /* SCTP_STATE_SHUTDOWN_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_do_9_2_shutdown_ack), \ /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ - TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ + TYPE_SCTP_FUNC(sctp_sf_do_9_2_shut_ctsn), \ /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ TYPE_SCTP_FUNC(sctp_sf_discard_chunk), \ } /* TYPE_SCTP_SHUTDOWN */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c647aab8d418..dc504d308ec0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -711,28 +711,30 @@ static struct sock *unix_find_other(struct net *net, int type, unsigned hash, int *error) { struct sock *u; - struct nameidata nd; + struct path path; int err = 0; if (sunname->sun_path[0]) { - err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); + struct inode *inode; + err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; - err = vfs_permission(&nd, MAY_WRITE); + inode = path.dentry->d_inode; + err = inode_permission(inode, MAY_WRITE); if (err) goto put_fail; err = -ECONNREFUSED; - if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) + if (!S_ISSOCK(inode->i_mode)) goto put_fail; - u = unix_find_socket_byinode(net, nd.path.dentry->d_inode); + u = unix_find_socket_byinode(net, inode); if (!u) goto put_fail; if (u->sk_type == type) - touch_atime(nd.path.mnt, nd.path.dentry); + touch_atime(path.mnt, path.dentry); - path_put(&nd.path); + path_put(&path); err=-EPROTOTYPE; if (u->sk_type != type) { @@ -753,7 +755,7 @@ static struct sock *unix_find_other(struct net *net, return u; put_fail: - path_put(&nd.path); + path_put(&path); fail: *error=err; return NULL; diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig index 7d82be07fa1d..646c7121dbc0 100644 --- a/net/wireless/Kconfig +++ b/net/wireless/Kconfig @@ -16,7 +16,7 @@ config NL80211 config WIRELESS_OLD_REGULATORY bool "Old wireless static regulatory definitions" - default n + default y ---help--- This option enables the old static regulatory information and uses it within the new framework. This is available @@ -40,11 +40,10 @@ config WIRELESS_OLD_REGULATORY ieee80211_regdom module parameter. This is being phased out and you should stop using them ASAP. - Say N unless you cannot install a new userspace application - or have one currently depending on the ieee80211_regdom module - parameter and cannot port it to use the new userspace interfaces. - - This is scheduled for removal for 2.6.29. + Say Y unless you have installed a new userspace application. + Also say Y if have one currently depending on the ieee80211_regdom + module parameter and cannot port it to use the new userspace + interfaces. config WIRELESS_EXT bool "Wireless extensions" |