-rw-r--r--	drivers/vhost/net.c	159
-rw-r--r--	drivers/vhost/vhost.c	55
2 files changed, 64 insertions, 150 deletions
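The diff below merges vhost-net's separate big-buffer and mergeable-buffer receive paths into a single handle_rx(), and get_rx_bufs() grows a headcount quota: up to UIO_MAXIOV buffer heads per packet when mergeable RX buffers are negotiated, exactly one head otherwise. The following user-space sketch only illustrates that quota loop; gather_bufs() and buf_len are hypothetical stand-ins, not the vhost API.

/*
 * Minimal sketch of the headcount-quota idea: consume buffer heads
 * until the packet fits or the quota is exhausted.  Returns the
 * number of heads used, or -1 if the packet cannot be placed.
 * Build: cc -o quota quota.c
 */
#include <stdio.h>

#define UIO_MAXIOV 1024		/* kernel's iovec cap */

static int gather_bufs(const int *buf_len, int nbufs, int datalen,
		       unsigned int quota)
{
	int headcount = 0;

	/* mirrors the patched condition:
	 * while (datalen > 0 && headcount < quota) */
	while (datalen > 0 && headcount < (int)quota) {
		if (headcount == nbufs)
			return -1;	/* ring exhausted */
		datalen -= buf_len[headcount++];
	}
	return datalen > 0 ? -1 : headcount;
}

int main(void)
{
	int small[] = { 1500, 1500, 1500 };	/* mergeable-style heads */
	int big[]   = { 65536 };		/* one GSO-sized buffer */

	/* mergeable: a 4000-byte packet may span several heads */
	printf("mergeable: %d\n", gather_bufs(small, 3, 4000, UIO_MAXIOV));
	/* big-buffer mode: quota is 1, the single head must fit it all */
	printf("big: %d\n", gather_bufs(big, 1, 4000, 1));
	return 0;
}

Passing the quota as an argument keeps one descriptor-gathering loop for both guest feature sets, which is what lets the duplicated RX handler below be deleted.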
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index f616cefc95ba..2f7c76a85e53 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -60,6 +60,7 @@ static int move_iovec_hdr(struct iovec *from, struct iovec *to,
 {
 	int seg = 0;
 	size_t size;
+
 	while (len && seg < iov_count) {
 		size = min(from->iov_len, len);
 		to->iov_base = from->iov_base;
@@ -79,6 +80,7 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
 {
 	int seg = 0;
 	size_t size;
+
 	while (len && seg < iovcount) {
 		size = min(from->iov_len, len);
 		to->iov_base = from->iov_base;
@@ -211,12 +213,13 @@ static int peek_head_len(struct sock *sk)
 {
 	struct sk_buff *head;
 	int len = 0;
+	unsigned long flags;
 
-	lock_sock(sk);
+	spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
 	head = skb_peek(&sk->sk_receive_queue);
-	if (head)
+	if (likely(head))
 		len = head->len;
-	release_sock(sk);
+	spin_unlock_irqrestore(&sk->sk_receive_queue.lock, flags);
 	return len;
 }
@@ -227,6 +230,7 @@ static int peek_head_len(struct sock *sk)
  * @iovcount	- returned count of io vectors we fill
  * @log		- vhost log
  * @log_num	- log offset
+ * @quota       - headcount quota, 1 for big buffer
  *	returns number of buffer heads allocated, negative on error
  */
 static int get_rx_bufs(struct vhost_virtqueue *vq,
@@ -234,7 +238,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 		       int datalen,
 		       unsigned *iovcount,
 		       struct vhost_log *log,
-		       unsigned *log_num)
+		       unsigned *log_num,
+		       unsigned int quota)
 {
 	unsigned int out, in;
 	int seg = 0;
@@ -242,7 +247,7 @@ static int get_rx_bufs(struct vhost_virtqueue *vq,
 	unsigned d;
 	int r, nlogs = 0;
 
-	while (datalen > 0) {
+	while (datalen > 0 && headcount < quota) {
 		if (unlikely(seg >= UIO_MAXIOV)) {
 			r = -ENOBUFS;
 			goto err;
@@ -282,117 +287,7 @@ err:
 
 /* Expects to be always run from workqueue - which acts as
  * read-size critical section for our kind of RCU. */
-static void handle_rx_big(struct vhost_net *net)
-{
-	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
-	unsigned out, in, log, s;
-	int head;
-	struct vhost_log *vq_log;
-	struct msghdr msg = {
-		.msg_name = NULL,
-		.msg_namelen = 0,
-		.msg_control = NULL, /* FIXME: get and handle RX aux data. */
-		.msg_controllen = 0,
-		.msg_iov = vq->iov,
-		.msg_flags = MSG_DONTWAIT,
-	};
-
-	struct virtio_net_hdr hdr = {
-		.flags = 0,
-		.gso_type = VIRTIO_NET_HDR_GSO_NONE
-	};
-
-	size_t len, total_len = 0;
-	int err;
-	size_t hdr_size;
-	/* TODO: check that we are running from vhost_worker? */
-	struct socket *sock = rcu_dereference_check(vq->private_data, 1);
-	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
-		return;
-
-	mutex_lock(&vq->mutex);
-	vhost_disable_notify(vq);
-	hdr_size = vq->vhost_hlen;
-
-	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
-		vq->log : NULL;
-
-	for (;;) {
-		head = vhost_get_vq_desc(&net->dev, vq, vq->iov,
-					 ARRAY_SIZE(vq->iov),
-					 &out, &in,
-					 vq_log, &log);
-		/* On error, stop handling until the next kick. */
-		if (unlikely(head < 0))
-			break;
-		/* OK, now we need to know about added descriptors. */
-		if (head == vq->num) {
-			if (unlikely(vhost_enable_notify(vq))) {
-				/* They have slipped one in as we were
-				 * doing that: check again. */
-				vhost_disable_notify(vq);
-				continue;
-			}
-			/* Nothing new? Wait for eventfd to tell us
-			 * they refilled. */
-			break;
-		}
-		/* We don't need to be notified again. */
-		if (out) {
-			vq_err(vq, "Unexpected descriptor format for RX: "
-			       "out %d, int %d\n",
-			       out, in);
-			break;
-		}
-		/* Skip header. TODO: support TSO/mergeable rx buffers. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
-		msg.msg_iovlen = in;
-		len = iov_length(vq->iov, in);
-		/* Sanity check */
-		if (!len) {
-			vq_err(vq, "Unexpected header len for RX: "
-			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
-			break;
-		}
-		err = sock->ops->recvmsg(NULL, sock, &msg,
-					 len, MSG_DONTWAIT | MSG_TRUNC);
-		/* TODO: Check specific error and bomb out unless EAGAIN? */
-		if (err < 0) {
-			vhost_discard_vq_desc(vq, 1);
-			break;
-		}
-		/* TODO: Should check and handle checksum. */
-		if (err > len) {
-			pr_debug("Discarded truncated rx packet: "
-				 " len %d > %zd\n", err, len);
-			vhost_discard_vq_desc(vq, 1);
-			continue;
-		}
-		len = err;
-		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
-		if (err) {
-			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
-			       vq->iov->iov_base, err);
-			break;
-		}
-		len += hdr_size;
-		vhost_add_used_and_signal(&net->dev, vq, head, len);
-		if (unlikely(vq_log))
-			vhost_log_write(vq, vq_log, log, len);
-		total_len += len;
-		if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
-			vhost_poll_queue(&vq->poll);
-			break;
-		}
-	}
-
-	mutex_unlock(&vq->mutex);
-}
-
-/* Expects to be always run from workqueue - which acts as
- * read-size critical section for our kind of RCU. */
-static void handle_rx_mergeable(struct vhost_net *net)
+static void handle_rx(struct vhost_net *net)
 {
 	struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
 	unsigned uninitialized_var(in), log;
@@ -405,19 +300,18 @@ static void handle_rx_mergeable(struct vhost_net *net)
 		.msg_iov = vq->iov,
 		.msg_flags = MSG_DONTWAIT,
 	};
-
 	struct virtio_net_hdr_mrg_rxbuf hdr = {
 		.hdr.flags = 0,
 		.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
 	};
-
 	size_t total_len = 0;
-	int err, headcount;
+	int err, headcount, mergeable;
 	size_t vhost_hlen, sock_hlen;
 	size_t vhost_len, sock_len;
 	/* TODO: check that we are running from vhost_worker? */
 	struct socket *sock = rcu_dereference_check(vq->private_data, 1);
-	if (!sock || skb_queue_empty(&sock->sk->sk_receive_queue))
+
+	if (!sock)
 		return;
 
 	mutex_lock(&vq->mutex);
@@ -427,12 +321,14 @@ static void handle_rx_mergeable(struct vhost_net *net)
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
+	mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);
 
 	while ((sock_len = peek_head_len(sock->sk))) {
 		sock_len += sock_hlen;
 		vhost_len = sock_len + vhost_hlen;
 		headcount = get_rx_bufs(vq, vq->heads, vhost_len,
-					&in, vq_log, &log);
+					&in, vq_log, &log,
+					likely(mergeable) ? UIO_MAXIOV : 1);
 		/* On error, stop handling until the next kick. */
 		if (unlikely(headcount < 0))
 			break;
@@ -476,7 +372,7 @@ static void handle_rx_mergeable(struct vhost_net *net)
 			break;
 		}
 		/* TODO: Should check and handle checksum. */
-		if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF) &&
+		if (likely(mergeable) &&
 		    memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
 				      offsetof(typeof(hdr), num_buffers),
 				      sizeof hdr.num_buffers)) {
@@ -498,14 +394,6 @@ static void handle_rx_mergeable(struct vhost_net *net)
 	mutex_unlock(&vq->mutex);
 }
 
-static void handle_rx(struct vhost_net *net)
-{
-	if (vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF))
-		handle_rx_mergeable(net);
-	else
-		handle_rx_big(net);
-}
-
 static void handle_tx_kick(struct vhost_work *work)
 {
 	struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
@@ -654,6 +542,7 @@ static struct socket *get_raw_socket(int fd)
 	} uaddr;
 	int uaddr_len = sizeof uaddr, r;
 	struct socket *sock = sockfd_lookup(fd, &r);
+
 	if (!sock)
 		return ERR_PTR(-ENOTSOCK);
 
@@ -682,6 +571,7 @@ static struct socket *get_tap_socket(int fd)
 {
 	struct file *file = fget(fd);
 	struct socket *sock;
+
 	if (!file)
 		return ERR_PTR(-EBADF);
 	sock = tun_get_socket(file);
@@ -696,6 +586,7 @@ static struct socket *get_tap_socket(int fd)
 static struct socket *get_socket(int fd)
 {
 	struct socket *sock;
+
 	/* special case to disable backend */
 	if (fd == -1)
 		return NULL;
@@ -741,9 +632,9 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 	oldsock = rcu_dereference_protected(vq->private_data,
 					    lockdep_is_held(&vq->mutex));
 	if (sock != oldsock) {
-                vhost_net_disable_vq(n, vq);
-                rcu_assign_pointer(vq->private_data, sock);
-                vhost_net_enable_vq(n, vq);
+		vhost_net_disable_vq(n, vq);
+		rcu_assign_pointer(vq->private_data, sock);
+		vhost_net_enable_vq(n, vq);
 	}
 
 	mutex_unlock(&vq->mutex);
@@ -768,6 +659,7 @@ static long vhost_net_reset_owner(struct vhost_net *n)
 	struct socket *tx_sock = NULL;
 	struct socket *rx_sock = NULL;
 	long err;
+
 	mutex_lock(&n->dev.mutex);
 	err = vhost_dev_check_owner(&n->dev);
 	if (err)
@@ -829,6 +721,7 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
 	struct vhost_vring_file backend;
 	u64 features;
 	int r;
+
 	switch (ioctl) {
 	case VHOST_NET_SET_BACKEND:
 		if (copy_from_user(&backend, argp, sizeof backend))
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index ade0568c07a4..2ab291241635 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -41,8 +41,8 @@ static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh,
 			    poll_table *pt)
 {
 	struct vhost_poll *poll;
-	poll = container_of(pt, struct vhost_poll, table);
 
+	poll = container_of(pt, struct vhost_poll, table);
 	poll->wqh = wqh;
 	add_wait_queue(wqh, &poll->wait);
 }
@@ -85,6 +85,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
 void vhost_poll_start(struct vhost_poll *poll, struct file *file)
 {
 	unsigned long mask;
+
 	mask = file->f_op->poll(file, &poll->table);
 	if (mask)
 		vhost_poll_wakeup(&poll->wait, 0, 0, (void *)mask);
@@ -101,6 +102,7 @@ static bool vhost_work_seq_done(struct vhost_dev *dev, struct vhost_work *work,
 				unsigned seq)
 {
 	int left;
+
 	spin_lock_irq(&dev->work_lock);
 	left = seq - work->done_seq;
 	spin_unlock_irq(&dev->work_lock);
@@ -222,6 +224,7 @@ static int vhost_worker(void *data)
 static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 {
 	int i;
+
 	for (i = 0; i < dev->nvqs; ++i) {
 		dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
 					       UIO_MAXIOV, GFP_KERNEL);
@@ -235,6 +238,7 @@ static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
 			goto err_nomem;
 	}
 	return 0;
+
 err_nomem:
 	for (; i >= 0; --i) {
 		kfree(dev->vqs[i].indirect);
@@ -247,6 +251,7 @@ err_nomem:
 static void vhost_dev_free_iovecs(struct vhost_dev *dev)
 {
 	int i;
+
 	for (i = 0; i < dev->nvqs; ++i) {
 		kfree(dev->vqs[i].indirect);
 		dev->vqs[i].indirect = NULL;
@@ -296,26 +301,28 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 }
 
 struct vhost_attach_cgroups_struct {
-        struct vhost_work work;
-        struct task_struct *owner;
-        int ret;
+	struct vhost_work work;
+	struct task_struct *owner;
+	int ret;
 };
 
 static void vhost_attach_cgroups_work(struct vhost_work *work)
 {
-        struct vhost_attach_cgroups_struct *s;
-        s = container_of(work, struct vhost_attach_cgroups_struct, work);
-        s->ret = cgroup_attach_task_all(s->owner, current);
+	struct vhost_attach_cgroups_struct *s;
+
+	s = container_of(work, struct vhost_attach_cgroups_struct, work);
+	s->ret = cgroup_attach_task_all(s->owner, current);
 }
 
 static int vhost_attach_cgroups(struct vhost_dev *dev)
 {
-        struct vhost_attach_cgroups_struct attach;
-        attach.owner = current;
-        vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-        vhost_work_queue(dev, &attach.work);
-        vhost_work_flush(dev, &attach.work);
-        return attach.ret;
+	struct vhost_attach_cgroups_struct attach;
+
+	attach.owner = current;
+	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
+	vhost_work_queue(dev, &attach.work);
+	vhost_work_flush(dev, &attach.work);
+	return attach.ret;
 }
 
 /* Caller should have device mutex */
@@ -323,11 +330,13 @@ static long vhost_dev_set_owner(struct vhost_dev *dev)
 {
 	struct task_struct *worker;
 	int err;
+
 	/* Is there an owner already? */
 	if (dev->mm) {
 		err = -EBUSY;
 		goto err_mm;
 	}
+
 	/* No owner, become one */
 	dev->mm = get_task_mm(current);
 	worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
@@ -380,6 +389,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
 void vhost_dev_cleanup(struct vhost_dev *dev)
 {
 	int i;
+
 	for (i = 0; i < dev->nvqs; ++i) {
 		if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
 			vhost_poll_stop(&dev->vqs[i].poll);
@@ -421,6 +431,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 static int log_access_ok(void __user *log_base, u64 addr, unsigned long sz)
 {
 	u64 a = addr / VHOST_PAGE_SIZE / 8;
+
 	/* Make sure 64 bit math will not overflow. */
 	if (a > ULONG_MAX - (unsigned long)log_base ||
 	    a + (unsigned long)log_base > ULONG_MAX)
@@ -461,6 +472,7 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
 			    int log_all)
 {
 	int i;
+
 	for (i = 0; i < d->nvqs; ++i) {
 		int ok;
 		mutex_lock(&d->vqs[i].mutex);
@@ -527,6 +539,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 {
 	struct vhost_memory mem, *newmem, *oldmem;
 	unsigned long size = offsetof(struct vhost_memory, regions);
+
 	if (copy_from_user(&mem, m, size))
 		return -EFAULT;
 	if (mem.padding)
@@ -544,7 +557,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
 		return -EFAULT;
 	}
 
-	if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) {
+	if (!memory_access_ok(d, newmem,
+			      vhost_has_feature(d, VHOST_F_LOG_ALL))) {
 		kfree(newmem);
 		return -EFAULT;
 	}
@@ -560,6 +574,7 @@ static int init_used(struct vhost_virtqueue *vq,
 		     struct vring_used __user *used)
 {
 	int r = put_user(vq->used_flags, &used->flags);
+
 	if (r)
 		return r;
 	return get_user(vq->last_used_idx, &used->idx);
@@ -849,6 +864,7 @@ static const struct vhost_memory_region *find_region(struct vhost_memory *mem,
 {
 	struct vhost_memory_region *reg;
 	int i;
+
 	/* linear search is not brilliant, but we really have on the order of 6
 	 * regions in practice */
 	for (i = 0; i < mem->nregions; ++i) {
@@ -871,6 +887,7 @@ static int set_bit_to_user(int nr, void __user *addr)
 	void *base;
 	int bit = nr + (log % PAGE_SIZE) * 8;
 	int r;
+
 	r = get_user_pages_fast(log, 1, 1, &page);
 	if (r < 0)
 		return r;
@@ -888,6 +905,7 @@ static int log_write(void __user *log_base,
 {
 	u64 write_page = write_address / VHOST_PAGE_SIZE;
 	int r;
+
 	if (!write_length)
 		return 0;
 	write_length += write_address % VHOST_PAGE_SIZE;
@@ -1037,8 +1055,8 @@ static int get_indirect(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			       i, count);
 			return -EINVAL;
 		}
-		if (unlikely(memcpy_fromiovec((unsigned char *)&desc, vq->indirect,
-					      sizeof desc))) {
+		if (unlikely(memcpy_fromiovec((unsigned char *)&desc,
+					      vq->indirect, sizeof desc))) {
 			vq_err(vq, "Failed indirect descriptor: idx %d, %zx\n",
 			       i, (size_t)indirect->addr + i * sizeof desc);
 			return -EINVAL;
@@ -1153,7 +1171,7 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq,
 			       i, vq->num, head);
 			return -EINVAL;
 		}
-		ret = copy_from_user(&desc, vq->desc + i, sizeof desc);
+		ret = __copy_from_user(&desc, vq->desc + i, sizeof desc);
 		if (unlikely(ret)) {
 			vq_err(vq, "Failed to get descriptor: idx %d addr %p\n",
 			       i, vq->desc + i);
@@ -1317,6 +1335,7 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads,
 void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
 {
 	__u16 flags;
+
 	/* Flush out used index updates. This is paired
 	 * with the barrier that the Guest executes when enabling
 	 * interrupts. */
@@ -1361,6 +1380,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq)
 {
 	u16 avail_idx;
 	int r;
+
 	if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY))
 		return false;
 	vq->used_flags &= ~VRING_USED_F_NO_NOTIFY;
@@ -1387,6 +1407,7 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq)
 void vhost_disable_notify(struct vhost_virtqueue *vq)
 {
 	int r;
+
 	if (vq->used_flags & VRING_USED_F_NO_NOTIFY)
 		return;
 	vq->used_flags |= VRING_USED_F_NO_NOTIFY;
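The other substantive change above is in peek_head_len(): lock_sock() is a heavyweight lock that may sleep, while this path only needs to glance at the first queued skb, so the patch peeks under sk_receive_queue.lock with interrupts saved (that queue is also touched from softirq context). A user-space analogue of the pattern, using a pthread spinlock around a hypothetical rx_queue that stands in for sk_receive_queue, might look like:

/*
 * Peek at the first queued packet's length under the queue's own
 * spinlock instead of a heavyweight per-socket lock.
 * Build: cc -o peek peek.c -pthread
 */
#include <pthread.h>
#include <stdio.h>

struct pkt {
	int len;
	struct pkt *next;
};

struct rx_queue {			/* stand-in for sk_receive_queue */
	pthread_spinlock_t lock;
	struct pkt *head;
};

static int peek_head_len(struct rx_queue *q)
{
	int len = 0;

	pthread_spin_lock(&q->lock);
	if (q->head)			/* queue may be empty */
		len = q->head->len;
	pthread_spin_unlock(&q->lock);
	return len;
}

int main(void)
{
	struct pkt p = { .len = 1500, .next = NULL };
	struct rx_queue q = { .head = &p };

	pthread_spin_init(&q.lock, PTHREAD_PROCESS_PRIVATE);
	printf("head len: %d\n", peek_head_len(&q));
	pthread_spin_destroy(&q.lock);
	return 0;
}

User space has no interrupt context, so a plain spinlock suffices here; the kernel code needs the _irqsave variant because sk_receive_queue.lock is also taken while servicing network softirqs.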