Diffstat (limited to 'drivers/vhost')
-rw-r--r--	drivers/vhost/Kconfig	|   5
-rw-r--r--	drivers/vhost/scsi.c	| 174
-rw-r--r--	drivers/vhost/vdpa.c	|  46
-rw-r--r--	drivers/vhost/vhost.c	| 135
-rw-r--r--	drivers/vhost/vhost.h	|  11
-rw-r--r--	drivers/vhost/vringh.c	| 191
-rw-r--r--	drivers/vhost/vsock.c	|   1
7 files changed, 330 insertions(+), 233 deletions(-)
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index 587fbae06182..b455d9ab6f3d 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -13,9 +13,14 @@ config VHOST_RING
 	  This option is selected by any driver which needs to access
 	  the host side of a virtio ring.
 
+config VHOST_TASK
+	bool
+	default n
+
 config VHOST
 	tristate
 	select VHOST_IOTLB
+	select VHOST_TASK
 	help
 	  This option is selected by any driver which needs to access
 	  the core of vhost.
diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index b244e7c0f514..bb10fa4bb4f6 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -125,7 +125,6 @@ struct vhost_scsi_tpg {
 	struct se_portal_group se_tpg;
 	/* Pointer back to vhost_scsi, protected by tv_tpg_mutex */
 	struct vhost_scsi *vhost_scsi;
-	struct list_head tmf_queue;
 };
 
 struct vhost_scsi_tport {
@@ -206,10 +205,8 @@ struct vhost_scsi {
 
 struct vhost_scsi_tmf {
 	struct vhost_work vwork;
-	struct vhost_scsi_tpg *tpg;
 	struct vhost_scsi *vhost;
 	struct vhost_scsi_virtqueue *svq;
-	struct list_head queue_entry;
 
 	struct se_cmd se_cmd;
 	u8 scsi_resp;
@@ -232,7 +229,10 @@ struct vhost_scsi_ctx {
 	struct iov_iter out_iter;
 };
 
-/* Global spinlock to protect vhost_scsi TPG list for vhost IOCTL access */
+/*
+ * Global mutex to protect vhost_scsi TPG list for vhost IOCTLs and LIO
+ * configfs management operations.
+ */
 static DEFINE_MUTEX(vhost_scsi_mutex);
 static LIST_HEAD(vhost_scsi_list);
 
@@ -294,11 +294,6 @@ static int vhost_scsi_check_true(struct se_portal_group *se_tpg)
 	return 1;
 }
 
-static int vhost_scsi_check_false(struct se_portal_group *se_tpg)
-{
-	return 0;
-}
-
 static char *vhost_scsi_get_fabric_wwn(struct se_portal_group *se_tpg)
 {
 	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
@@ -323,11 +318,6 @@ static int vhost_scsi_check_prot_fabric_only(struct se_portal_group *se_tpg)
 	return tpg->tv_fabric_prot_type;
 }
 
-static u32 vhost_scsi_tpg_get_inst_index(struct se_portal_group *se_tpg)
-{
-	return 1;
-}
-
 static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
 {
 	struct vhost_scsi_cmd *tv_cmd = container_of(se_cmd,
@@ -352,12 +342,9 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd)
 
 static void vhost_scsi_release_tmf_res(struct vhost_scsi_tmf *tmf)
 {
-	struct vhost_scsi_tpg *tpg = tmf->tpg;
 	struct vhost_scsi_inflight *inflight = tmf->inflight;
 
-	mutex_lock(&tpg->tv_tpg_mutex);
-	list_add_tail(&tpg->tmf_queue, &tmf->queue_entry);
-	mutex_unlock(&tpg->tv_tpg_mutex);
+	kfree(tmf);
 
 	vhost_scsi_put_inflight(inflight);
 }
@@ -378,11 +365,6 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
 	}
 }
 
-static u32 vhost_scsi_sess_get_index(struct se_session *se_sess)
-{
-	return 0;
-}
-
 static int vhost_scsi_write_pending(struct se_cmd *se_cmd)
 {
 	/* Go ahead and process the write immediately */
@@ -390,16 +372,6 @@ static int vhost_scsi_write_pending(struct se_cmd *se_cmd)
 	return 0;
 }
 
-static void vhost_scsi_set_default_node_attrs(struct se_node_acl *nacl)
-{
-	return;
-}
-
-static int vhost_scsi_get_cmd_state(struct se_cmd *se_cmd)
-{
-	return 0;
-}
-
 static int vhost_scsi_queue_data_in(struct se_cmd *se_cmd)
 {
 	transport_generic_free_cmd(se_cmd, 0);
@@ -671,7 +643,7 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls)
 {
 	int sgl_count = 0;
 
-	if (!iter || !iter->iov) {
+	if (!iter || !iter_iov(iter)) {
 		pr_err("%s: iter->iov is NULL, but expected bytes: %zu"
 		       " present\n", __func__, bytes);
 		return -EINVAL;
@@ -1194,19 +1166,11 @@ vhost_scsi_handle_tmf(struct vhost_scsi *vs, struct vhost_scsi_tpg *tpg,
 		goto send_reject;
 	}
 
-	mutex_lock(&tpg->tv_tpg_mutex);
-	if (list_empty(&tpg->tmf_queue)) {
-		pr_err("Missing reserve TMF. Could not handle LUN RESET.\n");
-		mutex_unlock(&tpg->tv_tpg_mutex);
+	tmf = kzalloc(sizeof(*tmf), GFP_KERNEL);
+	if (!tmf)
 		goto send_reject;
-	}
-
-	tmf = list_first_entry(&tpg->tmf_queue, struct vhost_scsi_tmf,
-			       queue_entry);
-	list_del_init(&tmf->queue_entry);
-	mutex_unlock(&tpg->tv_tpg_mutex);
 
-	tmf->tpg = tpg;
+	vhost_work_init(&tmf->vwork, vhost_scsi_tmf_resp_work);
 	tmf->vhost = vs;
 	tmf->svq = svq;
 	tmf->resp_iov = vq->iov[vc->out];
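The last hunk above is the core of the vhost-scsi TMF change: the reserved per-tpg descriptor list is gone and LUN RESET handling allocates on demand. A condensed sketch of the resulting lifecycle (stand-in type; the real fields and callers live in drivers/vhost/scsi.c as shown above):

	#include <linux/slab.h>

	struct tmf_ctx {		/* stand-in for struct vhost_scsi_tmf */
		int scsi_resp;
	};

	/* request path: allocate instead of dequeuing a preallocated entry */
	static struct tmf_ctx *tmf_alloc(void)
	{
		/* may sleep; a NULL return maps to rejecting the TMF */
		return kzalloc(sizeof(struct tmf_ctx), GFP_KERNEL);
	}

	/* completion path (vhost_scsi_release_tmf_res above): just free it */
	static void tmf_release(struct tmf_ctx *tmf)
	{
		kfree(tmf);	/* no list handoff, no tv_tpg_mutex needed */
	}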
@@ -1540,7 +1504,7 @@ out:
  * vhost_scsi_tpg with an active struct vhost_scsi_nexus
  *
  * The lock nesting rule is:
- *  vhost_scsi_mutex -> vs->dev.mutex -> tpg->tv_tpg_mutex -> vq->mutex
+ *  vs->dev.mutex -> vhost_scsi_mutex -> tpg->tv_tpg_mutex -> vq->mutex
  */
 static int
 vhost_scsi_set_endpoint(struct vhost_scsi *vs,
@@ -1554,7 +1518,6 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
 	int index, ret, i, len;
 	bool match = false;
 
-	mutex_lock(&vhost_scsi_mutex);
 	mutex_lock(&vs->dev.mutex);
 
 	/* Verify that ring has been setup correctly. */
@@ -1575,6 +1538,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
 	if (vs->vs_tpg)
 		memcpy(vs_tpg, vs->vs_tpg, len);
 
+	mutex_lock(&vhost_scsi_mutex);
 	list_for_each_entry(tpg, &vhost_scsi_list, tv_tpg_list) {
 		mutex_lock(&tpg->tv_tpg_mutex);
 		if (!tpg->tpg_nexus) {
@@ -1590,6 +1554,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
 		if (!strcmp(tv_tport->tport_name, t->vhost_wwpn)) {
 			if (vs->vs_tpg && vs->vs_tpg[tpg->tport_tpgt]) {
 				mutex_unlock(&tpg->tv_tpg_mutex);
+				mutex_unlock(&vhost_scsi_mutex);
 				ret = -EEXIST;
 				goto undepend;
 			}
@@ -1604,6 +1569,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
 			if (ret) {
 				pr_warn("target_depend_item() failed: %d\n", ret);
 				mutex_unlock(&tpg->tv_tpg_mutex);
+				mutex_unlock(&vhost_scsi_mutex);
 				goto undepend;
 			}
 			tpg->tv_tpg_vhost_count++;
@@ -1613,6 +1579,7 @@ vhost_scsi_set_endpoint(struct vhost_scsi *vs,
 		}
 		mutex_unlock(&tpg->tv_tpg_mutex);
 	}
+	mutex_unlock(&vhost_scsi_mutex);
 
 	if (match) {
 		memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn,
@@ -1658,14 +1625,16 @@ undepend:
 	for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) {
 		tpg = vs_tpg[i];
 		if (tpg) {
+			mutex_lock(&tpg->tv_tpg_mutex);
+			tpg->vhost_scsi = NULL;
 			tpg->tv_tpg_vhost_count--;
+			mutex_unlock(&tpg->tv_tpg_mutex);
 			target_undepend_item(&tpg->se_tpg.tpg_group.cg_item);
 		}
 	}
 	kfree(vs_tpg);
 out:
 	mutex_unlock(&vs->dev.mutex);
-	mutex_unlock(&vhost_scsi_mutex);
 	return ret;
 }
 
@@ -1681,7 +1650,6 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs,
 	int index, ret, i;
 	u8 target;
 
-	mutex_lock(&vhost_scsi_mutex);
 	mutex_lock(&vs->dev.mutex);
 	/* Verify that ring has been setup correctly. */
 	for (index = 0; index < vs->dev.nvqs; ++index) {
@@ -1702,11 +1670,10 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs,
 		if (!tpg)
 			continue;
 
-		mutex_lock(&tpg->tv_tpg_mutex);
 		tv_tport = tpg->tport;
 		if (!tv_tport) {
 			ret = -ENODEV;
-			goto err_tpg;
+			goto err_dev;
 		}
 
 		if (strcmp(tv_tport->tport_name, t->vhost_wwpn)) {
@@ -1715,35 +1682,51 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs,
 				tv_tport->tport_name, tpg->tport_tpgt,
 				t->vhost_wwpn, t->vhost_tpgt);
 			ret = -EINVAL;
-			goto err_tpg;
+			goto err_dev;
 		}
+		match = true;
+	}
+	if (!match)
+		goto free_vs_tpg;
+
+	/* Prevent new cmds from starting and accessing the tpgs/sessions */
+	for (i = 0; i < vs->dev.nvqs; i++) {
+		vq = &vs->vqs[i].vq;
+		mutex_lock(&vq->mutex);
+		vhost_vq_set_backend(vq, NULL);
+		mutex_unlock(&vq->mutex);
+	}
+	/* Make sure cmds are not running before tearing them down. */
+	vhost_scsi_flush(vs);
+
+	for (i = 0; i < vs->dev.nvqs; i++) {
+		vq = &vs->vqs[i].vq;
+		vhost_scsi_destroy_vq_cmds(vq);
+	}
+
+	/*
+	 * We can now release our hold on the tpg and sessions and userspace
+	 * can free them after this point.
+	 */
+	for (i = 0; i < VHOST_SCSI_MAX_TARGET; i++) {
+		target = i;
+		tpg = vs->vs_tpg[target];
+		if (!tpg)
+			continue;
 
+		mutex_lock(&tpg->tv_tpg_mutex);
+		tpg->tv_tpg_vhost_count--;
 		tpg->vhost_scsi = NULL;
 		vs->vs_tpg[target] = NULL;
-		match = true;
+		mutex_unlock(&tpg->tv_tpg_mutex);
 
-		/*
-		 * Release se_tpg->tpg_group.cg_item configfs dependency now
-		 * to allow vhost-scsi WWPN se_tpg->tpg_group shutdown to occur.
-		 */
+		se_tpg = &tpg->se_tpg;
 		target_undepend_item(&se_tpg->tpg_group.cg_item);
 	}
-	if (match) {
-		for (i = 0; i < vs->dev.nvqs; i++) {
-			vq = &vs->vqs[i].vq;
-			mutex_lock(&vq->mutex);
-			vhost_vq_set_backend(vq, NULL);
-			mutex_unlock(&vq->mutex);
-		}
-		/* Make sure cmds are not running before tearing them down. */
-		vhost_scsi_flush(vs);
-
-		for (i = 0; i < vs->dev.nvqs; i++) {
-			vq = &vs->vqs[i].vq;
-			vhost_scsi_destroy_vq_cmds(vq);
-		}
-	}
+free_vs_tpg:
 	/*
 	 * Act as synchronize_rcu to make sure access to
 	 * old vs->vs_tpg is finished.
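The rewritten nesting comment is the contract the rest of this patch follows: the global vhost_scsi_mutex now nests inside vs->dev.mutex instead of outside it, so it is only held across the tpg-list walk. A self-contained illustration of the documented order (stand-in mutexes; not a literal copy of vhost_scsi_set_endpoint()):

	#include <linux/mutex.h>

	static DEFINE_MUTEX(dev_mutex);		/* stands in for vs->dev.mutex */
	static DEFINE_MUTEX(tpg_list_mutex);	/* stands in for vhost_scsi_mutex */

	static void lock_order_sketch(void)
	{
		mutex_lock(&dev_mutex);		/* per-device mutex, outermost */
		mutex_lock(&tpg_list_mutex);	/* global list nests inside now */
		/* ... per-tpg tv_tpg_mutex, then vq->mutex, nest further in ... */
		mutex_unlock(&tpg_list_mutex);	/* dropped before any vq work */
		mutex_unlock(&dev_mutex);
	}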
@@ -1753,14 +1736,10 @@ vhost_scsi_clear_endpoint(struct vhost_scsi *vs,
 	 */
 	kfree(vs->vs_tpg);
 	vs->vs_tpg = NULL;
 	WARN_ON(vs->vs_events_nr);
 	mutex_unlock(&vs->dev.mutex);
-	mutex_unlock(&vhost_scsi_mutex);
 	return 0;
 
-err_tpg:
-	mutex_unlock(&tpg->tv_tpg_mutex);
 err_dev:
 	mutex_unlock(&vs->dev.mutex);
-	mutex_unlock(&vhost_scsi_mutex);
 	return ret;
 }
 
@@ -2001,8 +1980,6 @@ vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg,
 	if (!vs)
 		return;
 
-	mutex_lock(&vs->dev.mutex);
-
 	if (plug)
 		reason = VIRTIO_SCSI_EVT_RESET_RESCAN;
 	else
@@ -2010,11 +1987,18 @@ vhost_scsi_do_plug(struct vhost_scsi_tpg *tpg,
 
 	vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
 	mutex_lock(&vq->mutex);
+	/*
+	 * We can't queue events if the backend has been cleared, because
+	 * we could end up queueing an event after the flush.
+	 */
+	if (!vhost_vq_get_backend(vq))
+		goto unlock;
+
 	if (vhost_has_feature(vq, VIRTIO_SCSI_F_HOTPLUG))
 		vhost_scsi_send_evt(vs, tpg, lun,
 				    VIRTIO_SCSI_T_TRANSPORT_RESET, reason);
+unlock:
 	mutex_unlock(&vq->mutex);
-	mutex_unlock(&vs->dev.mutex);
 }
 
 static void vhost_scsi_hotplug(struct vhost_scsi_tpg *tpg, struct se_lun *lun)
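The do_plug hunk pairs with the clear_endpoint rework above: events are only queued while the event virtqueue still has a backend, and the check sits under vq->mutex so it cannot race with the backend being cleared and flushed. The guard pattern, reduced (assumes the stock vhost helpers from drivers/vhost/vhost.h):

	#include "vhost.h"	/* vhost_vq_get_backend(), struct vhost_virtqueue */

	static void queue_evt_sketch(struct vhost_virtqueue *vq)
	{
		mutex_lock(&vq->mutex);
		/* Backend NULLed by clear_endpoint? The flush may already
		 * have run, so queueing now would lose or corrupt the event. */
		if (!vhost_vq_get_backend(vq))
			goto unlock;

		/* ... safe to queue, e.g. vhost_scsi_send_evt(...) ... */
	unlock:
		mutex_unlock(&vq->mutex);
	}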
@@ -2032,24 +2016,11 @@ static int vhost_scsi_port_link(struct se_portal_group *se_tpg,
 {
 	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
 				struct vhost_scsi_tpg, se_tpg);
-	struct vhost_scsi_tmf *tmf;
-
-	tmf = kzalloc(sizeof(*tmf), GFP_KERNEL);
-	if (!tmf)
-		return -ENOMEM;
-	INIT_LIST_HEAD(&tmf->queue_entry);
-	vhost_work_init(&tmf->vwork, vhost_scsi_tmf_resp_work);
-
-	mutex_lock(&vhost_scsi_mutex);
 
 	mutex_lock(&tpg->tv_tpg_mutex);
 	tpg->tv_tpg_port_count++;
-	list_add_tail(&tmf->queue_entry, &tpg->tmf_queue);
-	mutex_unlock(&tpg->tv_tpg_mutex);
-
 	vhost_scsi_hotplug(tpg, lun);
-
-	mutex_unlock(&vhost_scsi_mutex);
+	mutex_unlock(&tpg->tv_tpg_mutex);
 
 	return 0;
 }
@@ -2059,21 +2030,11 @@ static void vhost_scsi_port_unlink(struct se_portal_group *se_tpg,
 {
 	struct vhost_scsi_tpg *tpg = container_of(se_tpg,
 				struct vhost_scsi_tpg, se_tpg);
-	struct vhost_scsi_tmf *tmf;
-
-	mutex_lock(&vhost_scsi_mutex);
 
 	mutex_lock(&tpg->tv_tpg_mutex);
 	tpg->tv_tpg_port_count--;
-	tmf = list_first_entry(&tpg->tmf_queue, struct vhost_scsi_tmf,
-			       queue_entry);
-	list_del(&tmf->queue_entry);
-	kfree(tmf);
-	mutex_unlock(&tpg->tv_tpg_mutex);
-
 	vhost_scsi_hotunplug(tpg, lun);
-
-	mutex_unlock(&vhost_scsi_mutex);
+	mutex_unlock(&tpg->tv_tpg_mutex);
 }
 
 static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_store(
@@ -2329,7 +2290,6 @@ vhost_scsi_make_tpg(struct se_wwn *wwn, const char *name)
 	}
 	mutex_init(&tpg->tv_tpg_mutex);
 	INIT_LIST_HEAD(&tpg->tv_tpg_list);
-	INIT_LIST_HEAD(&tpg->tmf_queue);
 	tpg->tport = tport;
 	tpg->tport_tpgt = tpgt;
 
@@ -2460,17 +2420,11 @@ static const struct target_core_fabric_ops vhost_scsi_ops = {
 	.tpg_get_tag = vhost_scsi_get_tpgt,
 	.tpg_check_demo_mode = vhost_scsi_check_true,
 	.tpg_check_demo_mode_cache = vhost_scsi_check_true,
-	.tpg_check_demo_mode_write_protect = vhost_scsi_check_false,
-	.tpg_check_prod_mode_write_protect = vhost_scsi_check_false,
 	.tpg_check_prot_fabric_only = vhost_scsi_check_prot_fabric_only,
-	.tpg_get_inst_index = vhost_scsi_tpg_get_inst_index,
 	.release_cmd = vhost_scsi_release_cmd,
 	.check_stop_free = vhost_scsi_check_stop_free,
-	.sess_get_index = vhost_scsi_sess_get_index,
 	.sess_get_initiator_sid = NULL,
 	.write_pending = vhost_scsi_write_pending,
-	.set_default_node_attributes = vhost_scsi_set_default_node_attrs,
-	.get_cmd_state = vhost_scsi_get_cmd_state,
 	.queue_data_in = vhost_scsi_queue_data_in,
 	.queue_status = vhost_scsi_queue_status,
 	.queue_tm_rsp = vhost_scsi_queue_tm_rsp,
diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 7be9d9d8f01c..8c1aefc865f0 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -219,6 +219,28 @@ static int vhost_vdpa_reset(struct vhost_vdpa *v)
 	return vdpa_reset(vdpa);
 }
 
+static long vhost_vdpa_bind_mm(struct vhost_vdpa *v)
+{
+	struct vdpa_device *vdpa = v->vdpa;
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	if (!vdpa->use_va || !ops->bind_mm)
+		return 0;
+
+	return ops->bind_mm(vdpa, v->vdev.mm);
+}
+
+static void vhost_vdpa_unbind_mm(struct vhost_vdpa *v)
+{
+	struct vdpa_device *vdpa = v->vdpa;
+	const struct vdpa_config_ops *ops = vdpa->config;
+
+	if (!vdpa->use_va || !ops->unbind_mm)
+		return;
+
+	ops->unbind_mm(vdpa);
+}
+
 static long vhost_vdpa_get_device_id(struct vhost_vdpa *v,
				     u8 __user *argp)
 {
 	struct vdpa_device *vdpa = v->vdpa;
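bind_mm/unbind_mm are optional vdpa_config_ops callbacks, consulted only for use_va parents; their signatures can be read off the call sites above (ops->bind_mm(vdpa, mm), ops->unbind_mm(vdpa)). A hypothetical parent driver might wire them up like this (driver names invented for illustration):

	#include <linux/vdpa.h>
	#include <linux/sched/mm.h>

	struct my_vdpa {			/* invented example device */
		struct vdpa_device vdpa;
		struct mm_struct *owner_mm;
	};

	static int my_bind_mm(struct vdpa_device *vdpa, struct mm_struct *mm)
	{
		struct my_vdpa *d = container_of(vdpa, struct my_vdpa, vdpa);

		mmgrab(mm);			/* keep the mm_struct alive */
		d->owner_mm = mm;		/* translate user VAs against it */
		return 0;
	}

	static void my_unbind_mm(struct vdpa_device *vdpa)
	{
		struct my_vdpa *d = container_of(vdpa, struct my_vdpa, vdpa);

		if (d->owner_mm)
			mmdrop(d->owner_mm);
		d->owner_mm = NULL;
	}

	static const struct vdpa_config_ops my_ops = {
		/* ... mandatory ops elided ... */
		.bind_mm	= my_bind_mm,
		.unbind_mm	= my_unbind_mm,
	};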
@@ -599,9 +621,11 @@ static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd,
 		if (vq->call_ctx.ctx) {
 			cb.callback = vhost_vdpa_virtqueue_cb;
 			cb.private = vq;
+			cb.trigger = vq->call_ctx.ctx;
 		} else {
 			cb.callback = NULL;
 			cb.private = NULL;
+			cb.trigger = NULL;
 		}
 		ops->set_vq_cb(vdpa, idx, &cb);
 		vhost_vdpa_setup_vq_irq(v, idx);
@@ -716,6 +740,17 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep,
 		break;
 	}
 
+	if (r)
+		goto out;
+
+	switch (cmd) {
+	case VHOST_SET_OWNER:
+		r = vhost_vdpa_bind_mm(v);
+		if (r)
+			vhost_dev_reset_owner(d, NULL);
+		break;
+	}
+out:
 	mutex_unlock(&d->mutex);
 	return r;
 }
@@ -851,11 +886,7 @@ static void vhost_vdpa_unmap(struct vhost_vdpa *v,
 		if (!v->in_batch)
 			ops->set_map(vdpa, asid, iotlb);
 	}
-	/* If we are in the middle of batch processing, delay the free
-	 * of AS until BATCH_END.
-	 */
-	if (!v->in_batch && !iotlb->nmaps)
-		vhost_vdpa_remove_as(v, asid);
+
 }
 
 static int vhost_vdpa_va_map(struct vhost_vdpa *v,
@@ -1112,8 +1143,6 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, u32 asid,
 		if (v->in_batch && ops->set_map)
 			ops->set_map(vdpa, asid, iotlb);
 		v->in_batch = false;
-		if (!iotlb->nmaps)
-			vhost_vdpa_remove_as(v, asid);
 		break;
 	default:
 		r = -EINVAL;
@@ -1140,7 +1169,7 @@ static int vhost_vdpa_alloc_domain(struct vhost_vdpa *v)
 	struct vdpa_device *vdpa = v->vdpa;
 	const struct vdpa_config_ops *ops = vdpa->config;
 	struct device *dma_dev = vdpa_get_dma_dev(vdpa);
-	struct bus_type *bus;
+	const struct bus_type *bus;
 	int ret;
 
 	/* Device want to do DMA by itself */
@@ -1287,6 +1316,7 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep)
 	vhost_vdpa_clean_irq(v);
 	vhost_vdpa_reset(v);
 	vhost_dev_stop(&v->vdev);
+	vhost_vdpa_unbind_mm(v);
 	vhost_vdpa_config_put(v);
 	vhost_vdpa_cleanup(v);
 	mutex_unlock(&d->mutex);
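With the unlocked_ioctl hunk above, a use_va parent is bound to the owner's mm as part of VHOST_SET_OWNER, and a bind failure rolls ownership back. Nothing changes for userspace; the usual setup order still applies (a sketch against the standard vhost uAPI; the device node name is just an example):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/vhost.h>

	int main(void)
	{
		int fd = open("/dev/vhost-vdpa-0", O_RDWR);

		if (fd < 0)
			return 1;
		/* For a use_va parent this now also binds the device to our
		 * mm (ops->bind_mm); on failure the kernel resets the owner. */
		if (ioctl(fd, VHOST_SET_OWNER, NULL) < 0)
			perror("VHOST_SET_OWNER");
		/* ... VHOST_SET_FEATURES, ring setup, etc. ... */
		return 0;
	}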
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index f11bdbe4c2c5..a92af08e7864 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -22,11 +22,11 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/kthread.h>
-#include <linux/cgroup.h>
 #include <linux/module.h>
 #include <linux/sort.h>
 #include <linux/sched/mm.h>
 #include <linux/sched/signal.h>
+#include <linux/sched/vhost_task.h>
 #include <linux/interval_tree_generic.h>
 #include <linux/nospec.h>
 #include <linux/kcov.h>
@@ -255,8 +255,8 @@ void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
 		 * sure it was not in the list.
 		 * test_and_set_bit() implies a memory barrier.
 		 */
-		llist_add(&work->node, &dev->work_list);
-		wake_up_process(dev->worker);
+		llist_add(&work->node, &dev->worker->work_list);
+		wake_up_process(dev->worker->vtsk->task);
 	}
 }
 EXPORT_SYMBOL_GPL(vhost_work_queue);
@@ -264,7 +264,7 @@ EXPORT_SYMBOL_GPL(vhost_work_queue);
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_has_work(struct vhost_dev *dev)
 {
-	return !llist_empty(&dev->work_list);
+	return dev->worker && !llist_empty(&dev->worker->work_list);
 }
 EXPORT_SYMBOL_GPL(vhost_has_work);
 
@@ -335,22 +335,20 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 static int vhost_worker(void *data)
 {
-	struct vhost_dev *dev = data;
+	struct vhost_worker *worker = data;
 	struct vhost_work *work, *work_next;
 	struct llist_node *node;
 
-	kthread_use_mm(dev->mm);
-
 	for (;;) {
 		/* mb paired w/ kthread_stop */
 		set_current_state(TASK_INTERRUPTIBLE);
 
-		if (kthread_should_stop()) {
+		if (vhost_task_should_stop(worker->vtsk)) {
 			__set_current_state(TASK_RUNNING);
 			break;
 		}
 
-		node = llist_del_all(&dev->work_list);
+		node = llist_del_all(&worker->work_list);
 		if (!node)
 			schedule();
 
@@ -360,14 +358,13 @@ static int vhost_worker(void *data)
 		llist_for_each_entry_safe(work, work_next, node, node) {
 			clear_bit(VHOST_WORK_QUEUED, &work->flags);
 			__set_current_state(TASK_RUNNING);
-			kcov_remote_start_common(dev->kcov_handle);
+			kcov_remote_start_common(worker->kcov_handle);
 			work->fn(work);
 			kcov_remote_stop();
-			if (need_resched())
-				schedule();
+			cond_resched();
 		}
 	}
-	kthread_unuse_mm(dev->mm);
+
 	return 0;
 }
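vhost_work_queue() and the loop above form a lock-free multi-producer, single-consumer queue on a struct llist_head: producers llist_add() and wake the task, the worker drains everything in one shot with llist_del_all(). The core pattern, self-contained (note that llist_del_all() hands back items in reverse insertion order, which vhost tolerates):

	#include <linux/llist.h>

	struct job {
		struct llist_node node;
		void (*fn)(struct job *job);
	};

	static LLIST_HEAD(jobs);

	static void wake_worker(void)
	{
		/* wake_up_process(worker_task) in real code */
	}

	/* any context, many producers */
	static void job_queue(struct job *job)
	{
		/* llist_add() returns true if the list was empty, i.e. the
		 * worker may be asleep and needs a wake-up */
		if (llist_add(&job->node, &jobs))
			wake_worker();
	}

	/* single consumer (here: the vhost worker task) */
	static void job_run_all(void)
	{
		struct llist_node *batch = llist_del_all(&jobs);
		struct job *job, *next;

		llist_for_each_entry_safe(job, next, batch, node)
			job->fn(job);
	}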
@@ -436,8 +433,7 @@ static size_t vhost_get_avail_size(struct vhost_virtqueue *vq,
 	size_t event __maybe_unused =
	       vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 
-	return sizeof(*vq->avail) +
-	       sizeof(*vq->avail->ring) * num + event;
+	return size_add(struct_size(vq->avail, ring, num), event);
 }
 
 static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
@@ -446,8 +442,7 @@ static size_t vhost_get_used_size(struct vhost_virtqueue *vq,
 	size_t event __maybe_unused =
	       vhost_has_feature(vq, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0;
 
-	return sizeof(*vq->used) +
-	       sizeof(*vq->used->ring) * num + event;
+	return size_add(struct_size(vq->used, ring, num), event);
 }
 
 static size_t vhost_get_desc_size(struct vhost_virtqueue *vq,
@@ -479,7 +474,6 @@ void vhost_dev_init(struct vhost_dev *dev,
 	dev->byte_weight = byte_weight;
 	dev->use_worker = use_worker;
 	dev->msg_handler = msg_handler;
-	init_llist_head(&dev->work_list);
 	init_waitqueue_head(&dev->wait);
 	INIT_LIST_HEAD(&dev->read_list);
 	INIT_LIST_HEAD(&dev->pending_list);
@@ -509,31 +503,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
 
-struct vhost_attach_cgroups_struct {
-	struct vhost_work work;
-	struct task_struct *owner;
-	int ret;
-};
-
-static void vhost_attach_cgroups_work(struct vhost_work *work)
-{
-	struct vhost_attach_cgroups_struct *s;
-
-	s = container_of(work, struct vhost_attach_cgroups_struct, work);
-	s->ret = cgroup_attach_task_all(s->owner, current);
-}
-
-static int vhost_attach_cgroups(struct vhost_dev *dev)
-{
-	struct vhost_attach_cgroups_struct attach;
-
-	attach.owner = current;
-	vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-	vhost_work_queue(dev, &attach.work);
-	vhost_dev_flush(dev);
-	return attach.ret;
-}
-
 /* Caller should have device mutex */
 bool vhost_dev_has_owner(struct vhost_dev *dev)
 {
@@ -571,10 +540,54 @@ static void vhost_detach_mm(struct vhost_dev *dev)
 	dev->mm = NULL;
 }
 
+static void vhost_worker_free(struct vhost_dev *dev)
+{
+	struct vhost_worker *worker = dev->worker;
+
+	if (!worker)
+		return;
+
+	dev->worker = NULL;
+	WARN_ON(!llist_empty(&worker->work_list));
+	vhost_task_stop(worker->vtsk);
+	kfree(worker);
+}
+
+static int vhost_worker_create(struct vhost_dev *dev)
+{
+	struct vhost_worker *worker;
+	struct vhost_task *vtsk;
+	char name[TASK_COMM_LEN];
+	int ret;
+
+	worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
+	if (!worker)
+		return -ENOMEM;
+
+	dev->worker = worker;
+	worker->kcov_handle = kcov_common_handle();
+	init_llist_head(&worker->work_list);
+	snprintf(name, sizeof(name), "vhost-%d", current->pid);
+
+	vtsk = vhost_task_create(vhost_worker, worker, name);
+	if (!vtsk) {
+		ret = -ENOMEM;
+		goto free_worker;
+	}
+
+	worker->vtsk = vtsk;
+	vhost_task_start(vtsk);
+	return 0;
+
+free_worker:
+	kfree(worker);
+	dev->worker = NULL;
+	return ret;
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-	struct task_struct *worker;
 	int err;
 
 	/* Is there an owner already? */
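vhost_worker_create()/vhost_worker_free() above are essentially the whole vhost_task API surface that the new VHOST_TASK option provides: create with a thread function, start, poll vhost_task_should_stop() in the loop, stop at teardown. Mirroring the patch (a sketch only; the real loop is vhost_worker() above):

	#include <linux/sched.h>
	#include <linux/sched/vhost_task.h>

	struct my_worker {
		struct vhost_task *vtsk;
	};

	static int my_worker_fn(void *data)
	{
		struct my_worker *w = data;

		for (;;) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (vhost_task_should_stop(w->vtsk)) {
				__set_current_state(TASK_RUNNING);
				break;
			}
			schedule();	/* real code drains a llist first */
		}
		return 0;
	}

	static int my_worker_start(struct my_worker *w)
	{
		/* The task is a child of the owner and shares its mm and
		 * cgroups, so the old kthread_use_mm() and
		 * cgroup_attach_task_all() dance is no longer needed. */
		w->vtsk = vhost_task_create(my_worker_fn, w, "example-worker");
		if (!w->vtsk)
			return -ENOMEM;
		vhost_task_start(w->vtsk);
		return 0;
	}
	/* teardown: vhost_task_stop(w->vtsk); */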
@@ -585,36 +598,21 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 
 	vhost_attach_mm(dev);
 
-	dev->kcov_handle = kcov_common_handle();
 	if (dev->use_worker) {
-		worker = kthread_create(vhost_worker, dev,
-					"vhost-%d", current->pid);
-		if (IS_ERR(worker)) {
-			err = PTR_ERR(worker);
-			goto err_worker;
-		}
-
-		dev->worker = worker;
-		wake_up_process(worker); /* avoid contributing to loadavg */
-
-		err = vhost_attach_cgroups(dev);
+		err = vhost_worker_create(dev);
 		if (err)
-			goto err_cgroup;
+			goto err_worker;
 	}
 
 	err = vhost_dev_alloc_iovecs(dev);
 	if (err)
-		goto err_cgroup;
+		goto err_iovecs;
 
 	return 0;
-err_cgroup:
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
-	}
+err_iovecs:
+	vhost_worker_free(dev);
 err_worker:
 	vhost_detach_mm(dev);
-	dev->kcov_handle = 0;
 err_mm:
 	return err;
 }
@@ -705,12 +703,7 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
 	dev->iotlb = NULL;
 	vhost_clear_msg(dev);
 	wake_up_interruptible_poll(&dev->wait, EPOLLIN | EPOLLRDNORM);
-	WARN_ON(!llist_empty(&dev->work_list));
-	if (dev->worker) {
-		kthread_stop(dev->worker);
-		dev->worker = NULL;
-		dev->kcov_handle = 0;
-	}
+	vhost_worker_free(dev);
 	vhost_detach_mm(dev);
 }
 EXPORT_SYMBOL_GPL(vhost_dev_cleanup);
@@ -1831,7 +1824,7 @@ EXPORT_SYMBOL_GPL(vhost_dev_ioctl);
 
 /* TODO: This is really inefficient.  We need something like get_user()
  * (instruction directly accesses the data, with an exception table entry
- * returning -EFAULT). See Documentation/x86/exception-tables.rst.
+ * returning -EFAULT). See Documentation/arch/x86/exception-tables.rst.
  */
 static int set_bit_to_user(int nr, void __user *addr)
 {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 1647b750169c..0308638cdeee 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -16,6 +16,7 @@
 #include <linux/irqbypass.h>
 
 struct vhost_work;
+struct vhost_task;
 typedef void (*vhost_work_fn_t)(struct vhost_work *work);
 
 #define VHOST_WORK_QUEUED 1
@@ -25,6 +26,12 @@ struct vhost_work {
 	unsigned long		flags;
 };
 
+struct vhost_worker {
+	struct vhost_task	*vtsk;
+	struct llist_head	work_list;
+	u64			kcov_handle;
+};
+
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
@@ -147,8 +154,7 @@ struct vhost_dev {
 	struct vhost_virtqueue **vqs;
 	int nvqs;
 	struct eventfd_ctx *log_ctx;
-	struct llist_head work_list;
-	struct task_struct *worker;
+	struct vhost_worker *worker;
 	struct vhost_iotlb *umem;
 	struct vhost_iotlb *iotlb;
 	spinlock_t iotlb_lock;
@@ -158,7 +164,6 @@ struct vhost_dev {
 	int iov_limit;
 	int weight;
 	int byte_weight;
-	u64 kcov_handle;
 	bool use_worker;
 	int (*msg_handler)(struct vhost_dev *dev, u32 asid,
 			   struct vhost_iotlb_msg *msg);
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
index a1e27da54481..955d938eb663 100644
--- a/drivers/vhost/vringh.c
+++ b/drivers/vhost/vringh.c
@@ -636,9 +636,9 @@ static inline int xfer_to_user(const struct vringh *vrh,
  * @features: the feature bits for this ring.
  * @num: the number of elements.
  * @weak_barriers: true if we only need memory barriers, not I/O.
- * @desc: the userpace descriptor pointer.
- * @avail: the userpace avail pointer.
- * @used: the userpace used pointer.
+ * @desc: the userspace descriptor pointer.
+ * @avail: the userspace avail pointer.
+ * @used: the userspace used pointer.
  *
  * Returns an error if num is invalid: you should check pointers
  * yourself!
@@ -911,9 +911,9 @@ static inline int kern_xfer(const struct vringh *vrh, void *dst,
  * @features: the feature bits for this ring.
  * @num: the number of elements.
  * @weak_barriers: true if we only need memory barriers, not I/O.
- * @desc: the userpace descriptor pointer.
- * @avail: the userpace avail pointer.
- * @used: the userpace used pointer.
+ * @desc: the userspace descriptor pointer.
+ * @avail: the userspace avail pointer.
+ * @used: the userspace used pointer.
  *
  * Returns an error if num is invalid.
  */
@@ -1094,10 +1094,17 @@ EXPORT_SYMBOL(vringh_need_notify_kern);
 
 #if IS_REACHABLE(CONFIG_VHOST_IOTLB)
 
+struct iotlb_vec {
+	union {
+		struct iovec *iovec;
+		struct bio_vec *bvec;
+	} iov;
+	size_t count;
+};
+
 static int iotlb_translate(const struct vringh *vrh,
 			   u64 addr, u64 len, u64 *translated,
-			   struct bio_vec iov[],
-			   int iov_size, u32 perm)
+			   struct iotlb_vec *ivec, u32 perm)
 {
 	struct vhost_iotlb_map *map;
 	struct vhost_iotlb *iotlb = vrh->iotlb;
@@ -1107,9 +1114,11 @@ static int iotlb_translate(const struct vringh *vrh,
 	spin_lock(vrh->iotlb_lock);
 
 	while (len > s) {
-		u64 size, pa, pfn;
+		uintptr_t io_addr;
+		size_t io_len;
+		u64 size;
 
-		if (unlikely(ret >= iov_size)) {
+		if (unlikely(ret >= ivec->count)) {
 			ret = -ENOBUFS;
 			break;
 		}
@@ -1124,10 +1133,22 @@ static int iotlb_translate(const struct vringh *vrh,
 		}
 
 		size = map->size - addr + map->start;
-		pa = map->addr + addr - map->start;
-		pfn = pa >> PAGE_SHIFT;
-		bvec_set_page(&iov[ret], pfn_to_page(pfn), min(len - s, size),
-			      pa & (PAGE_SIZE - 1));
+		io_len = min(len - s, size);
+		io_addr = map->addr - map->start + addr;
+
+		if (vrh->use_va) {
+			struct iovec *iovec = ivec->iov.iovec;
+
+			iovec[ret].iov_len = io_len;
+			iovec[ret].iov_base = (void __user *)io_addr;
+		} else {
+			u64 pfn = io_addr >> PAGE_SHIFT;
+			struct bio_vec *bvec = ivec->iov.bvec;
+
+			bvec_set_page(&bvec[ret], pfn_to_page(pfn), io_len,
+				      io_addr & (PAGE_SIZE - 1));
+		}
+
 		s += size;
 		addr += size;
 		++ret;
@@ -1141,26 +1162,41 @@ static int iotlb_translate(const struct vringh *vrh,
 	return ret;
 }
 
+#define IOTLB_IOV_STRIDE 16
+
 static inline int copy_from_iotlb(const struct vringh *vrh, void *dst,
 				  void *src, size_t len)
 {
+	struct iotlb_vec ivec;
+	union {
+		struct iovec iovec[IOTLB_IOV_STRIDE];
+		struct bio_vec bvec[IOTLB_IOV_STRIDE];
+	} iov;
 	u64 total_translated = 0;
 
+	ivec.iov.iovec = iov.iovec;
+	ivec.count = IOTLB_IOV_STRIDE;
+
 	while (total_translated < len) {
-		struct bio_vec iov[16];
 		struct iov_iter iter;
 		u64 translated;
 		int ret;
 
 		ret = iotlb_translate(vrh, (u64)(uintptr_t)src,
				      len - total_translated, &translated,
-				      iov, ARRAY_SIZE(iov), VHOST_MAP_RO);
+				      &ivec, VHOST_MAP_RO);
 		if (ret == -ENOBUFS)
-			ret = ARRAY_SIZE(iov);
+			ret = IOTLB_IOV_STRIDE;
 		else if (ret < 0)
 			return ret;
 
-		iov_iter_bvec(&iter, ITER_SOURCE, iov, ret, translated);
+		if (vrh->use_va) {
+			iov_iter_init(&iter, ITER_SOURCE, ivec.iov.iovec, ret,
+				      translated);
+		} else {
+			iov_iter_bvec(&iter, ITER_SOURCE, ivec.iov.bvec, ret,
+				      translated);
+		}
 
 		ret = copy_from_iter(dst, translated, &iter);
 		if (ret < 0)
@@ -1177,23 +1213,36 @@ static inline int copy_to_iotlb(const struct vringh *vrh, void *dst,
 				void *src, size_t len)
 {
+	struct iotlb_vec ivec;
+	union {
+		struct iovec iovec[IOTLB_IOV_STRIDE];
+		struct bio_vec bvec[IOTLB_IOV_STRIDE];
+	} iov;
 	u64 total_translated = 0;
 
+	ivec.iov.iovec = iov.iovec;
+	ivec.count = IOTLB_IOV_STRIDE;
+
 	while (total_translated < len) {
-		struct bio_vec iov[16];
 		struct iov_iter iter;
 		u64 translated;
 		int ret;
 
 		ret = iotlb_translate(vrh, (u64)(uintptr_t)dst,
				      len - total_translated, &translated,
-				      iov, ARRAY_SIZE(iov), VHOST_MAP_WO);
+				      &ivec, VHOST_MAP_WO);
 		if (ret == -ENOBUFS)
-			ret = ARRAY_SIZE(iov);
+			ret = IOTLB_IOV_STRIDE;
 		else if (ret < 0)
 			return ret;
 
-		iov_iter_bvec(&iter, ITER_DEST, iov, ret, translated);
+		if (vrh->use_va) {
+			iov_iter_init(&iter, ITER_DEST, ivec.iov.iovec, ret,
+				      translated);
+		} else {
+			iov_iter_bvec(&iter, ITER_DEST, ivec.iov.bvec, ret,
+				      translated);
+		}
 
 		ret = copy_to_iter(src, translated, &iter);
 		if (ret < 0)
@@ -1210,20 +1259,36 @@ static inline int copy_to_iotlb(const struct vringh *vrh, void *dst,
 static inline int getu16_iotlb(const struct vringh *vrh,
 			       u16 *val, const __virtio16 *p)
 {
-	struct bio_vec iov;
-	void *kaddr, *from;
+	struct iotlb_vec ivec;
+	union {
+		struct iovec iovec[1];
+		struct bio_vec bvec[1];
+	} iov;
+	__virtio16 tmp;
 	int ret;
 
+	ivec.iov.iovec = iov.iovec;
+	ivec.count = 1;
+
 	/* Atomic read is needed for getu16 */
-	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL,
-			      &iov, 1, VHOST_MAP_RO);
+	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+			      NULL, &ivec, VHOST_MAP_RO);
 	if (ret < 0)
 		return ret;
 
-	kaddr = kmap_atomic(iov.bv_page);
-	from = kaddr + iov.bv_offset;
-	*val = vringh16_to_cpu(vrh, READ_ONCE(*(__virtio16 *)from));
-	kunmap_atomic(kaddr);
+	if (vrh->use_va) {
+		ret = __get_user(tmp, (__virtio16 __user *)ivec.iov.iovec[0].iov_base);
+		if (ret)
+			return ret;
+	} else {
+		void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page);
+		void *from = kaddr + ivec.iov.bvec[0].bv_offset;
+
+		tmp = READ_ONCE(*(__virtio16 *)from);
+		kunmap_local(kaddr);
+	}
+
+	*val = vringh16_to_cpu(vrh, tmp);
 
 	return 0;
 }
@@ -1231,20 +1296,36 @@ static inline int getu16_iotlb(const struct vringh *vrh,
 static inline int putu16_iotlb(const struct vringh *vrh,
 			       __virtio16 *p, u16 val)
 {
-	struct bio_vec iov;
-	void *kaddr, *to;
+	struct iotlb_vec ivec;
+	union {
+		struct iovec iovec;
+		struct bio_vec bvec;
+	} iov;
+	__virtio16 tmp;
 	int ret;
 
+	ivec.iov.iovec = &iov.iovec;
+	ivec.count = 1;
+
 	/* Atomic write is needed for putu16 */
-	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p), NULL,
-			      &iov, 1, VHOST_MAP_WO);
+	ret = iotlb_translate(vrh, (u64)(uintptr_t)p, sizeof(*p),
+			      NULL, &ivec, VHOST_MAP_RO);
 	if (ret < 0)
 		return ret;
 
-	kaddr = kmap_atomic(iov.bv_page);
-	to = kaddr + iov.bv_offset;
-	WRITE_ONCE(*(__virtio16 *)to, cpu_to_vringh16(vrh, val));
-	kunmap_atomic(kaddr);
+	tmp = cpu_to_vringh16(vrh, val);
+
+	if (vrh->use_va) {
+		ret = __put_user(tmp, (__virtio16 __user *)ivec.iov.iovec[0].iov_base);
+		if (ret)
+			return ret;
+	} else {
+		void *kaddr = kmap_local_page(ivec.iov.bvec[0].bv_page);
+		void *to = kaddr + ivec.iov.bvec[0].bv_offset;
+
+		WRITE_ONCE(*(__virtio16 *)to, tmp);
+		kunmap_local(kaddr);
+	}
 
 	return 0;
 }
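A side effect of the getu16/putu16 rewrite is the move from kmap_atomic() to kmap_local_page(), the preferred interface: the mapping is CPU-local, nests, and is released with kunmap_local() in reverse order. The minimal pattern (a sketch; helper name invented):

	#include <linux/highmem.h>
	#include <linux/compiler.h>

	/* read a u16 from a (possibly highmem) page via a local mapping */
	static u16 peek_u16(struct page *page, unsigned int offset)
	{
		void *kaddr = kmap_local_page(page);
		u16 val = READ_ONCE(*(u16 *)(kaddr + offset));

		kunmap_local(kaddr);
		return val;
	}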
@@ -1306,9 +1387,9 @@ static inline int putused_iotlb(const struct vringh *vrh,
  * @features: the feature bits for this ring.
  * @num: the number of elements.
  * @weak_barriers: true if we only need memory barriers, not I/O.
- * @desc: the userpace descriptor pointer.
- * @avail: the userpace avail pointer.
- * @used: the userpace used pointer.
+ * @desc: the userspace descriptor pointer.
+ * @avail: the userspace avail pointer.
+ * @used: the userspace used pointer.
  *
  * Returns an error if num is invalid.
  */
@@ -1318,12 +1399,40 @@ int vringh_init_iotlb(struct vringh *vrh, u64 features,
 		      struct vring_avail *avail,
 		      struct vring_used *used)
 {
+	vrh->use_va = false;
+
 	return vringh_init_kern(vrh, features, num, weak_barriers,
 				desc, avail, used);
 }
 EXPORT_SYMBOL(vringh_init_iotlb);
 
 /**
+ * vringh_init_iotlb_va - initialize a vringh for a ring with IOTLB containing
+ *                        user VA.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userspace descriptor pointer.
+ * @avail: the userspace avail pointer.
+ * @used: the userspace used pointer.
+ *
+ * Returns an error if num is invalid.
+ */
+int vringh_init_iotlb_va(struct vringh *vrh, u64 features,
+			 unsigned int num, bool weak_barriers,
+			 struct vring_desc *desc,
+			 struct vring_avail *avail,
+			 struct vring_used *used)
+{
+	vrh->use_va = true;
+
+	return vringh_init_kern(vrh, features, num, weak_barriers,
+				desc, avail, used);
+}
+EXPORT_SYMBOL(vringh_init_iotlb_va);
+
+/**
  * vringh_set_iotlb - initialize a vringh for a ring with IOTLB.
  * @vrh: the vring
  * @iotlb: iotlb associated with this vring
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index c8e6087769a1..6578db78f0ae 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -439,6 +439,7 @@ static struct virtio_transport vhost_transport = {
 		.notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue,
 
 		.notify_buffer_size       = virtio_transport_notify_buffer_size,
+		.read_skb = virtio_transport_read_skb,
 	},
 
 	.send_pkt = vhost_transport_send_pkt,
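For a vringh consumer, opting into user-VA IOTLB translation is just a different init call; everything else (vringh_set_iotlb(), descriptor and iov handling) is unchanged. A sketch of driver-side setup, assuming the caller already owns a populated struct vhost_iotlb and fills in the ring addresses later:

	#include <linux/vringh.h>

	static int setup_va_ring(struct vringh *vrh, struct vhost_iotlb *iotlb,
				 spinlock_t *iotlb_lock, u64 features,
				 unsigned int num)
	{
		int ret;

		/* same arguments as vringh_init_iotlb(); this variant only
		 * flags the vringh so iotlb_translate() emits iovecs of user
		 * VAs instead of bio_vecs of pages */
		ret = vringh_init_iotlb_va(vrh, features, num, false,
					   NULL, NULL, NULL);
		if (ret)
			return ret;

		vringh_set_iotlb(vrh, iotlb, iotlb_lock);
		return 0;
	}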