summaryrefslogtreecommitdiffstats
path: root/drivers/vfio/vfio_iommu_type1.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-02-25 11:52:57 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2023-02-25 11:52:57 -0800
commitcac85e4616b1cf4a90844b952b49b9cbc4562530 (patch)
tree1d3d5f82188310497ced1a0ade2ab3b7a4346201 /drivers/vfio/vfio_iommu_type1.c
parent84cc6674b76ba2cdac0df8037b4d8a22a6fc1b77 (diff)
parentd649c34cb916b015fdcb487e51409fcc5caeca8d (diff)
downloadlinux-stable-cac85e4616b1cf4a90844b952b49b9cbc4562530.tar.gz
linux-stable-cac85e4616b1cf4a90844b952b49b9cbc4562530.tar.bz2
linux-stable-cac85e4616b1cf4a90844b952b49b9cbc4562530.zip
Merge tag 'vfio-v6.3-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson: - Remove redundant resource check in vfio-platform (Angus Chen) - Use GFP_KERNEL_ACCOUNT for persistent userspace allocations, allowing removal of arbitrary kernel limits in favor of cgroup control (Yishai Hadas) - mdev tidy-ups, including removing the module-only build restriction for sample drivers, Kconfig changes to select mdev support, documentation movement to keep sample driver usage instructions with sample drivers rather than with API docs, remove references to out-of-tree drivers in docs (Christoph Hellwig) - Fix collateral breakages from mdev Kconfig changes (Arnd Bergmann) - Make mlx5 migration support match device support, improve source and target flows to improve pre-copy support and reduce downtime (Yishai Hadas) - Convert additional mdev sysfs case to use sysfs_emit() (Bo Liu) - Resolve copy-paste error in mdev mbochs sample driver Kconfig (Ye Xingchen) - Avoid propagating missing reset error in vfio-platform if reset requirement is relaxed by module option (Tomasz Duszynski) - Range size fixes in mlx5 variant driver for missed last byte and stricter range calculation (Yishai Hadas) - Fixes to suspended vaddr support and locked_vm accounting, excluding mdev configurations from the former due to potential to indefinitely block kernel threads, fix underflow and restore locked_vm on new mm (Steve Sistare) - Update outdated vfio documentation due to new IOMMUFD interfaces in recent kernels (Yi Liu) - Resolve deadlock between group_lock and kvm_lock, finally (Matthew Rosato) - Fix NULL pointer in group initialization error path with IOMMUFD (Yan Zhao) * tag 'vfio-v6.3-rc1' of https://github.com/awilliam/linux-vfio: (32 commits) vfio: Fix NULL pointer dereference caused by uninitialized group->iommufd docs: vfio: Update vfio.rst per latest interfaces vfio: Update the kdoc for vfio_device_ops vfio/mlx5: Fix range size calculation upon tracker creation vfio: no need to pass kvm pointer during device open vfio: fix deadlock between group lock and kvm lock vfio: revert "iommu driver notify callback" vfio/type1: revert "implement notify callback" vfio/type1: revert "block on invalid vaddr" vfio/type1: restore locked_vm vfio/type1: track locked_vm per dma vfio/type1: prevent underflow of locked_vm via exec() vfio/type1: exclude mdevs from VFIO_UPDATE_VADDR vfio: platform: ignore missing reset if disabled at module init vfio/mlx5: Improve the target side flow to reduce downtime vfio/mlx5: Improve the source side flow upon pre_copy vfio/mlx5: Check whether VF is migratable samples: fix the prompt about SAMPLE_VFIO_MDEV_MBOCHS vfio/mdev: Use sysfs_emit() to instead of sprintf() vfio-mdev: add back CONFIG_VFIO dependency ...
Diffstat (limited to 'drivers/vfio/vfio_iommu_type1.c')
-rw-r--r--drivers/vfio/vfio_iommu_type1.c248
1 files changed, 111 insertions, 137 deletions
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a44ac3fe657c..493c31de0edb 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -71,11 +71,9 @@ struct vfio_iommu {
unsigned int vaddr_invalid_count;
uint64_t pgsize_bitmap;
uint64_t num_non_pinned_groups;
- wait_queue_head_t vaddr_wait;
bool v2;
bool nesting;
bool dirty_page_tracking;
- bool container_open;
struct list_head emulated_iommu_groups;
};
@@ -99,6 +97,8 @@ struct vfio_dma {
struct task_struct *task;
struct rb_root pfn_list; /* Ex-user pinned pfn list */
unsigned long *bitmap;
+ struct mm_struct *mm;
+ size_t locked_vm;
};
struct vfio_batch {
@@ -151,8 +151,6 @@ struct vfio_regions {
#define DIRTY_BITMAP_PAGES_MAX ((u64)INT_MAX)
#define DIRTY_BITMAP_SIZE_MAX DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
-#define WAITED 1
-
static int put_pfn(unsigned long pfn, int prot);
static struct vfio_iommu_group*
@@ -411,6 +409,19 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
return ret;
}
+static int mm_lock_acct(struct task_struct *task, struct mm_struct *mm,
+ bool lock_cap, long npage)
+{
+ int ret = mmap_write_lock_killable(mm);
+
+ if (ret)
+ return ret;
+
+ ret = __account_locked_vm(mm, abs(npage), npage > 0, task, lock_cap);
+ mmap_write_unlock(mm);
+ return ret;
+}
+
static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
{
struct mm_struct *mm;
@@ -419,16 +430,13 @@ static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
if (!npage)
return 0;
- mm = async ? get_task_mm(dma->task) : dma->task->mm;
- if (!mm)
+ mm = dma->mm;
+ if (async && !mmget_not_zero(mm))
return -ESRCH; /* process exited */
- ret = mmap_write_lock_killable(mm);
- if (!ret) {
- ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
- dma->lock_cap);
- mmap_write_unlock(mm);
- }
+ ret = mm_lock_acct(dma->task, mm, dma->lock_cap, npage);
+ if (!ret)
+ dma->locked_vm += npage;
if (async)
mmput(mm);
@@ -594,61 +602,6 @@ done:
return ret;
}
-static int vfio_wait(struct vfio_iommu *iommu)
-{
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
- mutex_unlock(&iommu->lock);
- schedule();
- mutex_lock(&iommu->lock);
- finish_wait(&iommu->vaddr_wait, &wait);
- if (kthread_should_stop() || !iommu->container_open ||
- fatal_signal_pending(current)) {
- return -EFAULT;
- }
- return WAITED;
-}
-
-/*
- * Find dma struct and wait for its vaddr to be valid. iommu lock is dropped
- * if the task waits, but is re-locked on return. Return result in *dma_p.
- * Return 0 on success with no waiting, WAITED on success if waited, and -errno
- * on error.
- */
-static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start,
- size_t size, struct vfio_dma **dma_p)
-{
- int ret = 0;
-
- do {
- *dma_p = vfio_find_dma(iommu, start, size);
- if (!*dma_p)
- return -EINVAL;
- else if (!(*dma_p)->vaddr_invalid)
- return ret;
- else
- ret = vfio_wait(iommu);
- } while (ret == WAITED);
-
- return ret;
-}
-
-/*
- * Wait for all vaddr in the dma_list to become valid. iommu lock is dropped
- * if the task waits, but is re-locked on return. Return 0 on success with no
- * waiting, WAITED on success if waited, and -errno on error.
- */
-static int vfio_wait_all_valid(struct vfio_iommu *iommu)
-{
- int ret = 0;
-
- while (iommu->vaddr_invalid_count && ret >= 0)
- ret = vfio_wait(iommu);
-
- return ret;
-}
-
/*
* Attempt to pin pages. We really don't want to track all the pfns and
* the iommu can only map chunks of consecutive pfns anyway, so get the
@@ -793,8 +746,8 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
struct mm_struct *mm;
int ret;
- mm = get_task_mm(dma->task);
- if (!mm)
+ mm = dma->mm;
+ if (!mmget_not_zero(mm))
return -ENODEV;
ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
@@ -804,7 +757,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
ret = 0;
if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
- ret = vfio_lock_acct(dma, 1, true);
+ ret = vfio_lock_acct(dma, 1, false);
if (ret) {
put_pfn(*pfn_base, dma->prot);
if (ret == -ENOMEM)
@@ -849,7 +802,6 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
unsigned long remote_vaddr;
struct vfio_dma *dma;
bool do_accounting;
- dma_addr_t iova;
if (!iommu || !pages)
return -EINVAL;
@@ -860,20 +812,10 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
mutex_lock(&iommu->lock);
- /*
- * Wait for all necessary vaddr's to be valid so they can be used in
- * the main loop without dropping the lock, to avoid racing vs unmap.
- */
-again:
- if (iommu->vaddr_invalid_count) {
- for (i = 0; i < npage; i++) {
- iova = user_iova + PAGE_SIZE * i;
- ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
- if (ret < 0)
- goto pin_done;
- if (ret == WAITED)
- goto again;
- }
+ if (WARN_ONCE(iommu->vaddr_invalid_count,
+ "vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
+ ret = -EBUSY;
+ goto pin_done;
}
/* Fail if no dma_umap notifier is registered */
@@ -891,6 +833,7 @@ again:
for (i = 0; i < npage; i++) {
unsigned long phys_pfn;
+ dma_addr_t iova;
struct vfio_pfn *vpfn;
iova = user_iova + PAGE_SIZE * i;
@@ -1173,11 +1116,10 @@ static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
vfio_unmap_unpin(iommu, dma, true);
vfio_unlink_dma(iommu, dma);
put_task_struct(dma->task);
+ mmdrop(dma->mm);
vfio_dma_bitmap_free(dma);
- if (dma->vaddr_invalid) {
+ if (dma->vaddr_invalid)
iommu->vaddr_invalid_count--;
- wake_up_all(&iommu->vaddr_wait);
- }
kfree(dma);
iommu->dma_avail++;
}
@@ -1342,6 +1284,12 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
mutex_lock(&iommu->lock);
+ /* Cannot update vaddr if mdev is present. */
+ if (invalidate_vaddr && !list_empty(&iommu->emulated_iommu_groups)) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
pgshift = __ffs(iommu->pgsize_bitmap);
pgsize = (size_t)1 << pgshift;
@@ -1566,6 +1514,38 @@ static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
return list_empty(iova);
}
+static int vfio_change_dma_owner(struct vfio_dma *dma)
+{
+ struct task_struct *task = current->group_leader;
+ struct mm_struct *mm = current->mm;
+ long npage = dma->locked_vm;
+ bool lock_cap;
+ int ret;
+
+ if (mm == dma->mm)
+ return 0;
+
+ lock_cap = capable(CAP_IPC_LOCK);
+ ret = mm_lock_acct(task, mm, lock_cap, npage);
+ if (ret)
+ return ret;
+
+ if (mmget_not_zero(dma->mm)) {
+ mm_lock_acct(dma->task, dma->mm, dma->lock_cap, -npage);
+ mmput(dma->mm);
+ }
+
+ if (dma->task != task) {
+ put_task_struct(dma->task);
+ dma->task = get_task_struct(task);
+ }
+ mmdrop(dma->mm);
+ dma->mm = mm;
+ mmgrab(dma->mm);
+ dma->lock_cap = lock_cap;
+ return 0;
+}
+
static int vfio_dma_do_map(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map *map)
{
@@ -1615,10 +1595,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
dma->size != size) {
ret = -EINVAL;
} else {
+ ret = vfio_change_dma_owner(dma);
+ if (ret)
+ goto out_unlock;
dma->vaddr = vaddr;
dma->vaddr_invalid = false;
iommu->vaddr_invalid_count--;
- wake_up_all(&iommu->vaddr_wait);
}
goto out_unlock;
} else if (dma) {
@@ -1652,29 +1634,15 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
* against the locked memory limit and we need to be able to do both
* outside of this call path as pinning can be asynchronous via the
* external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
- * task_struct and VM locked pages requires an mm_struct, however
- * holding an indefinite mm reference is not recommended, therefore we
- * only hold a reference to a task. We could hold a reference to
- * current, however QEMU uses this call path through vCPU threads,
- * which can be killed resulting in a NULL mm and failure in the unmap
- * path when called via a different thread. Avoid this problem by
- * using the group_leader as threads within the same group require
- * both CLONE_THREAD and CLONE_VM and will therefore use the same
- * mm_struct.
- *
- * Previously we also used the task for testing CAP_IPC_LOCK at the
- * time of pinning and accounting, however has_capability() makes use
- * of real_cred, a copy-on-write field, so we can't guarantee that it
- * matches group_leader, or in fact that it might not change by the
- * time it's evaluated. If a process were to call MAP_DMA with
- * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
- * possibly see different results for an iommu_mapped vfio_dma vs
- * externally mapped. Therefore track CAP_IPC_LOCK in vfio_dma at the
- * time of calling MAP_DMA.
+ * task_struct. Save the group_leader so that all DMA tracking uses
+ * the same task, to make debugging easier. VM locked pages requires
+ * an mm_struct, so grab the mm in case the task dies.
*/
get_task_struct(current->group_leader);
dma->task = current->group_leader;
dma->lock_cap = capable(CAP_IPC_LOCK);
+ dma->mm = current->mm;
+ mmgrab(dma->mm);
dma->pfn_list = RB_ROOT;
@@ -1707,10 +1675,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
int ret;
- ret = vfio_wait_all_valid(iommu);
- if (ret < 0)
- return ret;
-
/* Arbitrarily pick the first domain in the list for lookups */
if (!list_empty(&iommu->domain_list))
d = list_first_entry(&iommu->domain_list,
@@ -2188,11 +2152,16 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
struct iommu_domain_geometry *geo;
LIST_HEAD(iova_copy);
LIST_HEAD(group_resv_regions);
- int ret = -EINVAL;
+ int ret = -EBUSY;
mutex_lock(&iommu->lock);
+ /* Attach could require pinning, so disallow while vaddr is invalid. */
+ if (iommu->vaddr_invalid_count)
+ goto out_unlock;
+
/* Check for duplicates */
+ ret = -EINVAL;
if (vfio_iommu_find_iommu_group(iommu, iommu_group))
goto out_unlock;
@@ -2592,11 +2561,9 @@ static void *vfio_iommu_type1_open(unsigned long arg)
INIT_LIST_HEAD(&iommu->iova_list);
iommu->dma_list = RB_ROOT;
iommu->dma_avail = dma_entry_limit;
- iommu->container_open = true;
mutex_init(&iommu->lock);
mutex_init(&iommu->device_list_lock);
INIT_LIST_HEAD(&iommu->device_list);
- init_waitqueue_head(&iommu->vaddr_wait);
iommu->pgsize_bitmap = PAGE_MASK;
INIT_LIST_HEAD(&iommu->emulated_iommu_groups);
@@ -2660,6 +2627,16 @@ static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
return ret;
}
+static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
+{
+ bool ret;
+
+ mutex_lock(&iommu->lock);
+ ret = !list_empty(&iommu->emulated_iommu_groups);
+ mutex_unlock(&iommu->lock);
+ return ret;
+}
+
static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
unsigned long arg)
{
@@ -2668,8 +2645,13 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
case VFIO_TYPE1v2_IOMMU:
case VFIO_TYPE1_NESTING_IOMMU:
case VFIO_UNMAP_ALL:
- case VFIO_UPDATE_VADDR:
return 1;
+ case VFIO_UPDATE_VADDR:
+ /*
+ * Disable this feature if mdevs are present. They cannot
+ * safely pin/unpin/rw while vaddrs are being updated.
+ */
+ return iommu && !vfio_iommu_has_emulated(iommu);
case VFIO_DMA_CC_IOMMU:
if (!iommu)
return 0;
@@ -3078,21 +3060,19 @@ static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
struct vfio_dma *dma;
bool kthread = current->mm == NULL;
size_t offset;
- int ret;
*copied = 0;
- ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma);
- if (ret < 0)
- return ret;
+ dma = vfio_find_dma(iommu, user_iova, 1);
+ if (!dma)
+ return -EINVAL;
if ((write && !(dma->prot & IOMMU_WRITE)) ||
!(dma->prot & IOMMU_READ))
return -EPERM;
- mm = get_task_mm(dma->task);
-
- if (!mm)
+ mm = dma->mm;
+ if (!mmget_not_zero(mm))
return -EPERM;
if (kthread)
@@ -3138,6 +3118,13 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
size_t done;
mutex_lock(&iommu->lock);
+
+ if (WARN_ONCE(iommu->vaddr_invalid_count,
+ "vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
+ ret = -EBUSY;
+ goto out;
+ }
+
while (count > 0) {
ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
count, write, &done);
@@ -3149,6 +3136,7 @@ static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
user_iova += done;
}
+out:
mutex_unlock(&iommu->lock);
return ret;
}
@@ -3176,19 +3164,6 @@ vfio_iommu_type1_group_iommu_domain(void *iommu_data,
return domain;
}
-static void vfio_iommu_type1_notify(void *iommu_data,
- enum vfio_iommu_notify_type event)
-{
- struct vfio_iommu *iommu = iommu_data;
-
- if (event != VFIO_IOMMU_CONTAINER_CLOSE)
- return;
- mutex_lock(&iommu->lock);
- iommu->container_open = false;
- mutex_unlock(&iommu->lock);
- wake_up_all(&iommu->vaddr_wait);
-}
-
static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
.name = "vfio-iommu-type1",
.owner = THIS_MODULE,
@@ -3203,7 +3178,6 @@ static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
.unregister_device = vfio_iommu_type1_unregister_device,
.dma_rw = vfio_iommu_type1_dma_rw,
.group_iommu_domain = vfio_iommu_type1_group_iommu_domain,
- .notify = vfio_iommu_type1_notify,
};
static int __init vfio_iommu_type1_init(void)