From 6c38c055cc4c0a5da31873d173b2de3085f43f33 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 30 Dec 2016 08:13:31 -0700 Subject: vfio/type1: Restore mapping performance with mdev support As part of the mdev support, type1 now gets a task reference per vfio_dma and uses that to get an mm reference for the task while working on accounting. That's correct, but it's not fast. For some paths, like vfio_pin_pages_remote(), we know we're only called from user context, so we can restore the lighter weight calls. In other cases, we're effectively already testing whether we're in the stored task context elsewhere, extend this vfio_lock_acct() as well. Signed-off-by: Alex Williamson Reviewed by: Kirti Wankhede --- drivers/vfio/vfio_iommu_type1.c | 98 +++++++++++++++++++++-------------------- 1 file changed, 51 insertions(+), 47 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index f3726ba12aa6..9266271a787a 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -268,28 +268,38 @@ static void vfio_lock_acct(struct task_struct *task, long npage) { struct vwork *vwork; struct mm_struct *mm; + bool is_current; if (!npage) return; - mm = get_task_mm(task); + is_current = (task->mm == current->mm); + + mm = is_current ? task->mm : get_task_mm(task); if (!mm) - return; /* process exited or nothing to do */ + return; /* process exited */ if (down_write_trylock(&mm->mmap_sem)) { mm->locked_vm += npage; up_write(&mm->mmap_sem); - mmput(mm); + if (!is_current) + mmput(mm); return; } + if (is_current) { + mm = get_task_mm(task); + if (!mm) + return; + } + /* * Couldn't get mmap_sem lock, so must setup to update * mm->locked_vm later. 
If locked_vm were atomic, we * wouldn't need this silliness */ vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); - if (!vwork) { + if (WARN_ON(!vwork)) { mmput(mm); return; } @@ -393,77 +403,71 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, long npage, unsigned long *pfn_base) { - unsigned long limit; - bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, - CAP_IPC_LOCK); - struct mm_struct *mm; - long ret, i = 0, lock_acct = 0; + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); + long ret, pinned = 0, lock_acct = 0; bool rsvd; dma_addr_t iova = vaddr - dma->vaddr + dma->iova; - mm = get_task_mm(dma->task); - if (!mm) + /* This code path is only user initiated */ + if (!current->mm) return -ENODEV; - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); + ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base); if (ret) - goto pin_pg_remote_exit; + return ret; + pinned++; rsvd = is_invalid_reserved_pfn(*pfn_base); - limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; /* * Reserved pages aren't counted against the user, externally pinned * pages are already counted against the user. 
*/ if (!rsvd && !vfio_find_vpfn(dma, iova)) { - if (!lock_cap && mm->locked_vm + 1 > limit) { + if (!lock_cap && current->mm->locked_vm + 1 > limit) { put_pfn(*pfn_base, dma->prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); - ret = -ENOMEM; - goto pin_pg_remote_exit; + return -ENOMEM; } lock_acct++; } - i++; - if (likely(!disable_hugepages)) { - /* Lock all the consecutive pages from pfn_base */ - for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage; - i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { - unsigned long pfn = 0; + if (unlikely(disable_hugepages)) + goto out; - ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn); - if (ret) - break; + /* Lock all the consecutive pages from pfn_base */ + for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; + pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { + unsigned long pfn = 0; - if (pfn != *pfn_base + i || - rsvd != is_invalid_reserved_pfn(pfn)) { + ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); + if (ret) + break; + + if (pfn != *pfn_base + pinned || + rsvd != is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, dma->prot); + break; + } + + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!lock_cap && + current->mm->locked_vm + lock_acct + 1 > limit) { put_pfn(pfn, dma->prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, limit << PAGE_SHIFT); break; } - - if (!rsvd && !vfio_find_vpfn(dma, iova)) { - if (!lock_cap && - mm->locked_vm + lock_acct + 1 > limit) { - put_pfn(pfn, dma->prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) " - "exceeded\n", __func__, - limit << PAGE_SHIFT); - break; - } - lock_acct++; - } + lock_acct++; } } - vfio_lock_acct(dma->task, lock_acct); - ret = i; +out: + vfio_lock_acct(current, lock_acct); -pin_pg_remote_exit: - mmput(mm); - return ret; + return pinned; } static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, @@ -473,10 +477,10 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, long 
unlocked = 0, locked = 0; long i; - for (i = 0; i < npage; i++) { + for (i = 0; i < npage; i++, iova += PAGE_SIZE) { if (put_pfn(pfn++, dma->prot)) { unlocked++; - if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT))) + if (vfio_find_vpfn(dma, iova)) locked++; } } -- cgit v1.2.3 From 49550787a90b5bfa44d8dc424d11824dbe21473d Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 30 Dec 2016 08:13:33 -0700 Subject: vfio-mdev: Fix remove race Using the mtty mdev sample driver we can generate a remove race by starting one shell that continuously creates mtty devices and several other shells all attempting to remove devices, in my case four remove shells. The fault occurs in mdev_remove_sysfs_files() where the passed type arg is NULL, which suggests we've received a struct device in mdev_device_remove() but it's in some sort of teardown state. The solution here is to make use of the accidentally unused list_head on the mdev_device such that the mdev core keeps a list of all the mdev devices. This allows us to validate that we have a valid mdev before we start removal, remove it from the list to prevent others from working on it, and if the vendor driver refuses to remove, we can re-add it to the list. 
Cc: Kirti Wankhede Signed-off-by: Alex Williamson --- drivers/vfio/mdev/mdev_core.c | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index be1ee89ee917..6bb4d4c469ab 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -27,6 +27,9 @@ static LIST_HEAD(parent_list); static DEFINE_MUTEX(parent_list_lock); static struct class_compat *mdev_bus_compat_class; +static LIST_HEAD(mdev_list); +static DEFINE_MUTEX(mdev_list_lock); + static int _find_mdev_device(struct device *dev, void *data) { struct mdev_device *mdev; @@ -316,6 +319,11 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) dev_dbg(&mdev->dev, "MDEV: created\n"); mutex_unlock(&parent->lock); + + mutex_lock(&mdev_list_lock); + list_add(&mdev->next, &mdev_list); + mutex_unlock(&mdev_list_lock); + return ret; create_failed: @@ -329,12 +337,30 @@ create_err: int mdev_device_remove(struct device *dev, bool force_remove) { - struct mdev_device *mdev; + struct mdev_device *mdev, *tmp; struct parent_device *parent; struct mdev_type *type; int ret; + bool found = false; mdev = to_mdev_device(dev); + + mutex_lock(&mdev_list_lock); + list_for_each_entry(tmp, &mdev_list, next) { + if (tmp == mdev) { + found = true; + break; + } + } + + if (found) + list_del(&mdev->next); + + mutex_unlock(&mdev_list_lock); + + if (!found) + return -ENODEV; + type = to_mdev_type(mdev->type_kobj); parent = mdev->parent; mutex_lock(&parent->lock); @@ -342,6 +368,11 @@ int mdev_device_remove(struct device *dev, bool force_remove) ret = mdev_device_remove_ops(mdev, force_remove); if (ret) { mutex_unlock(&parent->lock); + + mutex_lock(&mdev_list_lock); + list_add(&mdev->next, &mdev_list); + mutex_unlock(&mdev_list_lock); + return ret; } @@ -349,7 +380,8 @@ int mdev_device_remove(struct device *dev, bool force_remove) device_unregister(dev); 
mutex_unlock(&parent->lock); mdev_put_parent(parent); - return ret; + + return 0; } static int __init mdev_init(void) -- cgit v1.2.3 From 42930553a7c11f06351bc08b889808d0f6020f08 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 30 Dec 2016 08:13:38 -0700 Subject: vfio-mdev: de-pollute the namespace, rename parent_device & parent_ops Add an mdev_ prefix so we're not polluting the namespace so much. Cc: Zhenyu Wang Cc: Zhi Wang Cc: Jike Song Signed-off-by: Alex Williamson Reviewed-by: Kirti Wankhede --- drivers/vfio/mdev/mdev_core.c | 28 ++++++++++++++-------------- drivers/vfio/mdev/mdev_private.h | 6 +++--- drivers/vfio/mdev/mdev_sysfs.c | 8 ++++---- drivers/vfio/mdev/vfio_mdev.c | 12 ++++++------ 4 files changed, 27 insertions(+), 27 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 6bb4d4c469ab..bf3b3b0b3d2b 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -45,7 +45,7 @@ static int _find_mdev_device(struct device *dev, void *data) return 0; } -static bool mdev_device_exist(struct parent_device *parent, uuid_le uuid) +static bool mdev_device_exist(struct mdev_parent *parent, uuid_le uuid) { struct device *dev; @@ -59,9 +59,9 @@ static bool mdev_device_exist(struct parent_device *parent, uuid_le uuid) } /* Should be called holding parent_list_lock */ -static struct parent_device *__find_parent_device(struct device *dev) +static struct mdev_parent *__find_parent_device(struct device *dev) { - struct parent_device *parent; + struct mdev_parent *parent; list_for_each_entry(parent, &parent_list, next) { if (parent->dev == dev) @@ -72,8 +72,8 @@ static struct parent_device *__find_parent_device(struct device *dev) static void mdev_release_parent(struct kref *kref) { - struct parent_device *parent = container_of(kref, struct parent_device, - ref); + struct mdev_parent *parent = container_of(kref, struct mdev_parent, + ref); struct device *dev = parent->dev; 
kfree(parent); @@ -81,7 +81,7 @@ static void mdev_release_parent(struct kref *kref) } static -inline struct parent_device *mdev_get_parent(struct parent_device *parent) +inline struct mdev_parent *mdev_get_parent(struct mdev_parent *parent) { if (parent) kref_get(&parent->ref); @@ -89,7 +89,7 @@ inline struct parent_device *mdev_get_parent(struct parent_device *parent) return parent; } -static inline void mdev_put_parent(struct parent_device *parent) +static inline void mdev_put_parent(struct mdev_parent *parent) { if (parent) kref_put(&parent->ref, mdev_release_parent); @@ -98,7 +98,7 @@ static inline void mdev_put_parent(struct parent_device *parent) static int mdev_device_create_ops(struct kobject *kobj, struct mdev_device *mdev) { - struct parent_device *parent = mdev->parent; + struct mdev_parent *parent = mdev->parent; int ret; ret = parent->ops->create(kobj, mdev); @@ -125,7 +125,7 @@ static int mdev_device_create_ops(struct kobject *kobj, */ static int mdev_device_remove_ops(struct mdev_device *mdev, bool force_remove) { - struct parent_device *parent = mdev->parent; + struct mdev_parent *parent = mdev->parent; int ret; /* @@ -156,10 +156,10 @@ static int mdev_device_remove_cb(struct device *dev, void *data) * Add device to list of registered parent devices. * Returns a negative value on error, otherwise 0. 
*/ -int mdev_register_device(struct device *dev, const struct parent_ops *ops) +int mdev_register_device(struct device *dev, const struct mdev_parent_ops *ops) { int ret; - struct parent_device *parent; + struct mdev_parent *parent; /* check for mandatory ops */ if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups) @@ -232,7 +232,7 @@ EXPORT_SYMBOL(mdev_register_device); void mdev_unregister_device(struct device *dev) { - struct parent_device *parent; + struct mdev_parent *parent; bool force_remove = true; mutex_lock(&parent_list_lock); @@ -269,7 +269,7 @@ int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) { int ret; struct mdev_device *mdev; - struct parent_device *parent; + struct mdev_parent *parent; struct mdev_type *type = to_mdev_type(kobj); parent = mdev_get_parent(type->parent); @@ -338,7 +338,7 @@ create_err: int mdev_device_remove(struct device *dev, bool force_remove) { struct mdev_device *mdev, *tmp; - struct parent_device *parent; + struct mdev_parent *parent; struct mdev_type *type; int ret; bool found = false; diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index d35097cbf3d7..0b72c2d9ee40 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -19,7 +19,7 @@ void mdev_bus_unregister(void); struct mdev_type { struct kobject kobj; struct kobject *devices_kobj; - struct parent_device *parent; + struct mdev_parent *parent; struct list_head next; struct attribute_group *group; }; @@ -29,8 +29,8 @@ struct mdev_type { #define to_mdev_type(_kobj) \ container_of(_kobj, struct mdev_type, kobj) -int parent_create_sysfs_files(struct parent_device *parent); -void parent_remove_sysfs_files(struct parent_device *parent); +int parent_create_sysfs_files(struct mdev_parent *parent); +void parent_remove_sysfs_files(struct mdev_parent *parent); int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); void mdev_remove_sysfs_files(struct device 
*dev, struct mdev_type *type); diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c index 1a53deb2ee10..802df210929b 100644 --- a/drivers/vfio/mdev/mdev_sysfs.c +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -92,7 +92,7 @@ static struct kobj_type mdev_type_ktype = { .release = mdev_type_release, }; -struct mdev_type *add_mdev_supported_type(struct parent_device *parent, +struct mdev_type *add_mdev_supported_type(struct mdev_parent *parent, struct attribute_group *group) { struct mdev_type *type; @@ -158,7 +158,7 @@ static void remove_mdev_supported_type(struct mdev_type *type) kobject_put(&type->kobj); } -static int add_mdev_supported_type_groups(struct parent_device *parent) +static int add_mdev_supported_type_groups(struct mdev_parent *parent) { int i; @@ -183,7 +183,7 @@ static int add_mdev_supported_type_groups(struct parent_device *parent) } /* mdev sysfs functions */ -void parent_remove_sysfs_files(struct parent_device *parent) +void parent_remove_sysfs_files(struct mdev_parent *parent) { struct mdev_type *type, *tmp; @@ -196,7 +196,7 @@ void parent_remove_sysfs_files(struct parent_device *parent) kset_unregister(parent->mdev_types_kset); } -int parent_create_sysfs_files(struct parent_device *parent) +int parent_create_sysfs_files(struct mdev_parent *parent) { int ret; diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c index ffc36758cb84..fa848a701b8b 100644 --- a/drivers/vfio/mdev/vfio_mdev.c +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -27,7 +27,7 @@ static int vfio_mdev_open(void *device_data) { struct mdev_device *mdev = device_data; - struct parent_device *parent = mdev->parent; + struct mdev_parent *parent = mdev->parent; int ret; if (unlikely(!parent->ops->open)) @@ -46,7 +46,7 @@ static int vfio_mdev_open(void *device_data) static void vfio_mdev_release(void *device_data) { struct mdev_device *mdev = device_data; - struct parent_device *parent = mdev->parent; + struct mdev_parent *parent = mdev->parent; if 
(likely(parent->ops->release)) parent->ops->release(mdev); @@ -58,7 +58,7 @@ static long vfio_mdev_unlocked_ioctl(void *device_data, unsigned int cmd, unsigned long arg) { struct mdev_device *mdev = device_data; - struct parent_device *parent = mdev->parent; + struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->ioctl)) return -EINVAL; @@ -70,7 +70,7 @@ static ssize_t vfio_mdev_read(void *device_data, char __user *buf, size_t count, loff_t *ppos) { struct mdev_device *mdev = device_data; - struct parent_device *parent = mdev->parent; + struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->read)) return -EINVAL; @@ -82,7 +82,7 @@ static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, size_t count, loff_t *ppos) { struct mdev_device *mdev = device_data; - struct parent_device *parent = mdev->parent; + struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->write)) return -EINVAL; @@ -93,7 +93,7 @@ static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) { struct mdev_device *mdev = device_data; - struct parent_device *parent = mdev->parent; + struct mdev_parent *parent = mdev->parent; if (unlikely(!parent->ops->mmap)) return -EINVAL; -- cgit v1.2.3 From 9372e6feaafb65d88f667ffb5b7b425f8568344f Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 30 Dec 2016 08:13:41 -0700 Subject: vfio-mdev: Make mdev_parent private Rather than hoping for good behavior by marking some elements internal, enforce it by making the entire structure private and creating an accessor function for the one useful external field. 
Cc: Zhenyu Wang Cc: Zhi Wang Cc: Jike Song Signed-off-by: Alex Williamson Reviewed by: Kirti Wankhede --- drivers/vfio/mdev/mdev_core.c | 6 ++++++ drivers/vfio/mdev/mdev_private.h | 10 ++++++++++ 2 files changed, 16 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index bf3b3b0b3d2b..30d05304241e 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -30,6 +30,12 @@ static struct class_compat *mdev_bus_compat_class; static LIST_HEAD(mdev_list); static DEFINE_MUTEX(mdev_list_lock); +struct device *mdev_parent_dev(struct mdev_device *mdev) +{ + return mdev->parent->dev; +} +EXPORT_SYMBOL(mdev_parent_dev); + static int _find_mdev_device(struct device *dev, void *data) { struct mdev_device *mdev; diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index 0b72c2d9ee40..b05dd22fc9a6 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -16,6 +16,16 @@ int mdev_bus_register(void); void mdev_bus_unregister(void); +struct mdev_parent { + struct device *dev; + const struct mdev_parent_ops *ops; + struct kref ref; + struct mutex lock; + struct list_head next; + struct kset *mdev_types_kset; + struct list_head type_list; +}; + struct mdev_type { struct kobject kobj; struct kobject *devices_kobj; -- cgit v1.2.3 From 99e3123e3d72616a829dad6d25aa005ef1ef9b13 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 30 Dec 2016 08:13:44 -0700 Subject: vfio-mdev: Make mdev_device private and abstract interfaces Abstract access to mdev_device so that we can define which interfaces are public rather than relying on comments in the structure. 
Cc: Zhenyu Wang Cc: Zhi Wang Signed-off-by: Alex Williamson Reviewed-by: Jike Song Reviewed-by: Kirti Wankhede --- drivers/vfio/mdev/mdev_core.c | 30 ++++++++++++++++++++++++++++++ drivers/vfio/mdev/mdev_private.h | 13 +++++++++++++ 2 files changed, 43 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c index 30d05304241e..36d75c367d22 100644 --- a/drivers/vfio/mdev/mdev_core.c +++ b/drivers/vfio/mdev/mdev_core.c @@ -36,6 +36,36 @@ struct device *mdev_parent_dev(struct mdev_device *mdev) } EXPORT_SYMBOL(mdev_parent_dev); +void *mdev_get_drvdata(struct mdev_device *mdev) +{ + return mdev->driver_data; +} +EXPORT_SYMBOL(mdev_get_drvdata); + +void mdev_set_drvdata(struct mdev_device *mdev, void *data) +{ + mdev->driver_data = data; +} +EXPORT_SYMBOL(mdev_set_drvdata); + +struct device *mdev_dev(struct mdev_device *mdev) +{ + return &mdev->dev; +} +EXPORT_SYMBOL(mdev_dev); + +struct mdev_device *mdev_from_dev(struct device *dev) +{ + return dev_is_mdev(dev) ? 
to_mdev_device(dev) : NULL; +} +EXPORT_SYMBOL(mdev_from_dev); + +uuid_le mdev_uuid(struct mdev_device *mdev) +{ + return mdev->uuid; +} +EXPORT_SYMBOL(mdev_uuid); + static int _find_mdev_device(struct device *dev, void *data) { struct mdev_device *mdev; diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h index b05dd22fc9a6..a9cefd70a705 100644 --- a/drivers/vfio/mdev/mdev_private.h +++ b/drivers/vfio/mdev/mdev_private.h @@ -26,6 +26,19 @@ struct mdev_parent { struct list_head type_list; }; +struct mdev_device { + struct device dev; + struct mdev_parent *parent; + uuid_le uuid; + void *driver_data; + struct kref ref; + struct list_head next; + struct kobject *type_kobj; +}; + +#define to_mdev_device(dev) container_of(dev, struct mdev_device, dev) +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type) + struct mdev_type { struct kobject kobj; struct kobject *devices_kobj; -- cgit v1.2.3 From 45e869714489431625c569d21fc952428d761476 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 30 Dec 2016 08:13:47 -0700 Subject: vfio-pci: use 32-bit comparisons for register address for gcc-4.5 Using ancient compilers (gcc-4.5 or older) on ARM, we get a link failure with the vfio-pci driver: ERROR: "__aeabi_lcmp" [drivers/vfio/pci/vfio-pci.ko] undefined! The reason is that the compiler tries to do a comparison of a 64-bit range. This changes it to convert to a 32-bit number explicitly first, as newer compilers do for themselves. 
Signed-off-by: Arnd Bergmann Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_rdwr.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c index 5ffd1d9ad4bd..357243d76f10 100644 --- a/drivers/vfio/pci/vfio_pci_rdwr.c +++ b/drivers/vfio/pci/vfio_pci_rdwr.c @@ -193,7 +193,10 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, char __user *buf, if (!vdev->has_vga) return -EINVAL; - switch (pos) { + if (pos > 0xbfffful) + return -EINVAL; + + switch ((u32)pos) { case 0xa0000 ... 0xbffff: count = min(count, (size_t)(0xc0000 - pos)); iomem = ioremap_nocache(0xa0000, 0xbffff - 0xa0000 + 1); -- cgit v1.2.3 From e19f32da5ded958238eac1bbe001192acef191a2 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 3 Jan 2017 17:26:46 +0530 Subject: vfio-pci: Handle error from pci_iomap Here, pci_iomap can fail; handle this case by releasing the selected PCI regions and returning -ENOMEM. Signed-off-by: Arvind Yadav Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index dcd7c2a99618..324c52e3a1a4 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -1142,6 +1142,10 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) return ret; vdev->barmap[index] = pci_iomap(pdev, index, 0); + if (!vdev->barmap[index]) { + pci_release_selected_regions(pdev, 1 << index); + return -ENOMEM; + } } vma->vm_private_data = vdev; -- cgit v1.2.3 From d1b333d12cde9cabe898160b6be9769d3382d81c Mon Sep 17 00:00:00 2001 From: Jike Song Date: Thu, 12 Jan 2017 16:52:03 +0800 Subject: vfio iommu type1: fix the testing of capability for remote task Before the mdev enhancement type1 iommu used capable() to test the capability of current task; in the course of mdev development a new requirement, testing 
for another task other than current, was raised. ns_capable() was used for this purpose, however it still tests current, the only difference is, in a specified namespace. Fix it by using has_capability() instead, which tests the cap for specified task in init_user_ns, the same namespace as capable(). Cc: Gerd Hoffmann Signed-off-by: Jike Song Reviewed-by: James Morris Reviewed-by: Kirti Wankhede Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9266271a787a..77373e51b283 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -495,8 +495,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, unsigned long *pfn_base, bool do_accounting) { unsigned long limit; - bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, - CAP_IPC_LOCK); + bool lock_cap = has_capability(dma->task, CAP_IPC_LOCK); struct mm_struct *mm; int ret; bool rsvd; -- cgit v1.2.3 From 94a6fa899d2cb5ee76933406df32996576a562e4 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 12 Jan 2017 08:24:16 -0700 Subject: vfio/type1: Remove pid_namespace.h include Using has_capability() rather than ns_capable(), we're no longer using this header. 
Cc: Jike Song Cc: Kirti Wankhede Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_type1.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 77373e51b283..b3cc33fa6d26 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3 From 5d704992189fe8683a23f583a2f4f137a9b9d28b Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Jan 2017 20:58:01 +0000 Subject: vfio/type1: Allow transparent MSI IOVA allocation When attaching a group to the container, check the group's reserved regions and test whether the IOMMU translates MSI transactions. If yes, we initialize an IOVA allocator through the iommu_get_msi_cookie API. This will allow the MSI IOVAs to be transparently allocated on MSI controller's compose(). Signed-off-by: Eric Auger Acked-by: Alex Williamson Reviewed-by: Tomasz Nowicki Tested-by: Tomasz Nowicki Tested-by: Bharat Bhushan Signed-off-by: Will Deacon --- drivers/vfio/vfio_iommu_type1.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9266271a787a..5651fafe8cfb 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -39,6 +39,7 @@ #include #include #include +#include #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson " @@ -1181,6 +1182,28 @@ static struct vfio_group *find_iommu_group(struct vfio_domain *domain, return NULL; } +static bool vfio_iommu_has_resv_msi(struct iommu_group *group, + phys_addr_t *base) +{ + struct list_head group_resv_regions; + struct iommu_resv_region *region, *next; + bool ret = false; + + INIT_LIST_HEAD(&group_resv_regions); + iommu_get_group_resv_regions(group, &group_resv_regions); + list_for_each_entry(region, &group_resv_regions, 
list) { + if (region->type & IOMMU_RESV_MSI) { + *base = region->start; + ret = true; + goto out; + } + } +out: + list_for_each_entry_safe(region, next, &group_resv_regions, list) + kfree(region); + return ret; +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -1189,6 +1212,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, struct vfio_domain *domain, *d; struct bus_type *bus = NULL, *mdev_bus; int ret; + bool resv_msi; + phys_addr_t resv_msi_base; mutex_lock(&iommu->lock); @@ -1258,6 +1283,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_domain; + resv_msi = vfio_iommu_has_resv_msi(iommu_group, &resv_msi_base); + INIT_LIST_HEAD(&domain->group_list); list_add(&group->next, &domain->group_list); @@ -1304,6 +1331,9 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_detach; + if (resv_msi && iommu_get_msi_cookie(domain->domain, resv_msi_base)) + goto out_detach; + list_add(&domain->next, &iommu->domain_list); mutex_unlock(&iommu->lock); -- cgit v1.2.3 From 9d72f87babf144ff3ca5d85655c710de05110038 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Thu, 19 Jan 2017 20:58:02 +0000 Subject: vfio/type1: Check MSI remapping at irq domain level In case the IOMMU translates MSI transactions (typical case on ARM), we check MSI remapping capability at IRQ domain level. Otherwise it is checked at IOMMU level. At this stage the arm-smmu-(v3) still advertise the IOMMU_CAP_INTR_REMAP capability at IOMMU level. This will be removed in subsequent patches. 
Signed-off-by: Eric Auger Acked-by: Alex Williamson Reviewed-by: Tomasz Nowicki Tested-by: Tomasz Nowicki Tested-by: Bharat Bhushan Signed-off-by: Will Deacon --- drivers/vfio/vfio_iommu_type1.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5651fafe8cfb..ec903a005fae 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -40,6 +40,7 @@ #include #include #include +#include #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson " @@ -1212,7 +1213,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, struct vfio_domain *domain, *d; struct bus_type *bus = NULL, *mdev_bus; int ret; - bool resv_msi; + bool resv_msi, msi_remap; phys_addr_t resv_msi_base; mutex_lock(&iommu->lock); @@ -1288,8 +1289,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, INIT_LIST_HEAD(&domain->group_list); list_add(&group->next, &domain->group_list); - if (!allow_unsafe_interrupts && - !iommu_capable(bus, IOMMU_CAP_INTR_REMAP)) { + msi_remap = resv_msi ? irq_domain_check_msi_remap() : + iommu_capable(bus, IOMMU_CAP_INTR_REMAP); + + if (!allow_unsafe_interrupts && !msi_remap) { pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n", __func__); ret = -EPERM; -- cgit v1.2.3 From bd00fdf198e2da475a2f4265a83686ab42d998a8 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Tue, 24 Jan 2017 17:50:26 +0100 Subject: vfio/spapr: fail tce_iommu_attach_group() when iommu_data is null The recently added mediated VFIO driver doesn't know about powerpc iommu. It thus doesn't register a struct iommu_table_group in the iommu group upon device creation. The iommu_data pointer hence remains null. This causes a kernel oops when userspace tries to set the iommu type of a container associated with a mediated device to VFIO_SPAPR_TCE_v2_IOMMU. 
[ 82.585440] mtty mtty: MDEV: Registered [ 87.655522] iommu: Adding device 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001 to group 10 [ 87.655527] vfio_mdev 83b8f4f2-509f-382f-3c1e-e6bfe0fa1001: MDEV: group_id = 10 [ 116.297184] Unable to handle kernel paging request for data at address 0x00000030 [ 116.297389] Faulting instruction address: 0xd000000007870524 [ 116.297465] Oops: Kernel access of bad area, sig: 11 [#1] [ 116.297611] SMP NR_CPUS=2048 [ 116.297611] NUMA [ 116.297627] PowerNV ... [ 116.297954] CPU: 33 PID: 7067 Comm: qemu-system-ppc Not tainted 4.10.0-rc5-mdev-test #8 [ 116.297993] task: c000000e7718b680 task.stack: c000000e77214000 [ 116.298025] NIP: d000000007870524 LR: d000000007870518 CTR: 0000000000000000 [ 116.298064] REGS: c000000e77217990 TRAP: 0300 Not tainted (4.10.0-rc5-mdev-test) [ 116.298103] MSR: 9000000000009033 [ 116.298107] CR: 84004444 XER: 00000000 [ 116.298154] CFAR: c00000000000888c DAR: 0000000000000030 DSISR: 40000000 SOFTE: 1 GPR00: d000000007870518 c000000e77217c10 d00000000787b0ed c000000eed2103c0 GPR04: 0000000000000000 0000000000000000 c000000eed2103e0 0000000f24320000 GPR08: 0000000000000104 0000000000000001 0000000000000000 d0000000078729b0 GPR12: c00000000025b7e0 c00000000fe08400 0000000000000001 000001002d31d100 GPR16: 000001002c22c850 00003ffff315c750 0000000043145680 0000000043141bc0 GPR20: ffffffffffffffed fffffffffffff000 0000000020003b65 d000000007706018 GPR24: c000000f16cf0d98 d000000007706000 c000000003f42980 c000000003f42980 GPR28: c000000f1575ac00 c000000003f429c8 0000000000000000 c000000eed2103c0 [ 116.298504] NIP [d000000007870524] tce_iommu_attach_group+0x10c/0x360 [vfio_iommu_spapr_tce] [ 116.298555] LR [d000000007870518] tce_iommu_attach_group+0x100/0x360 [vfio_iommu_spapr_tce] [ 116.298601] Call Trace: [ 116.298610] [c000000e77217c10] [d000000007870518] tce_iommu_attach_group+0x100/0x360 [vfio_iommu_spapr_tce] (unreliable) [ 116.298671] [c000000e77217cb0] [d0000000077033a0] vfio_fops_unl_ioctl+0x278/0x3e0 [vfio] [ 
116.298713] [c000000e77217d40] [c0000000002a3ebc] do_vfs_ioctl+0xcc/0x8b0 [ 116.298745] [c000000e77217de0] [c0000000002a4700] SyS_ioctl+0x60/0xc0 [ 116.298782] [c000000e77217e30] [c00000000000b220] system_call+0x38/0xfc [ 116.298812] Instruction dump: [ 116.298828] 7d3f4b78 409effc8 3d220000 e9298020 3c800140 38a00018 608480c0 e8690028 [ 116.298869] 4800249d e8410018 7c7f1b79 41820230 2fa90000 419e0114 e9090020 [ 116.298914] ---[ end trace 1e10b0ced08b9120 ]--- This patch fixes the oops by making tce_iommu_attach_group() fail with -ENODEV when the iommu group has no iommu_table_group registered. Reported-by: Vaibhav Jain Signed-off-by: Greg Kurz Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_spapr_tce.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index c8823578a1b2..128d10282d16 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -1270,6 +1270,10 @@ static int tce_iommu_attach_group(void *iommu_data, /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n", iommu_group_id(iommu_group), iommu_group); */ table_group = iommu_group_get_iommudata(iommu_group); + if (!table_group) { + ret = -ENODEV; + goto unlock_exit; + } if (tce_groups_attached(container) && (!table_group->ops || !table_group->ops->take_ownership || -- cgit v1.2.3 From 2da64d20a0b20046d688e44f4033efd09157e29d Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Wed, 1 Feb 2017 14:26:16 +1100 Subject: vfio/spapr: Fix missing mutex unlock when creating a window Commit d9c728949ddc ("vfio/spapr: Postpone default window creation") added an additional exit to the VFIO_IOMMU_SPAPR_TCE_CREATE case and made it possible to return from tce_iommu_ioctl() without unlocking container->lock; this fixes the issue.
Fixes: d9c728949ddc ("vfio/spapr: Postpone default window creation") Signed-off-by: Alexey Kardashevskiy Reviewed-by: David Gibson Signed-off-by: Alex Williamson --- drivers/vfio/vfio_iommu_spapr_tce.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index 128d10282d16..7690e5bf3cf1 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -1123,12 +1123,11 @@ static long tce_iommu_ioctl(void *iommu_data, mutex_lock(&container->lock); ret = tce_iommu_create_default_window(container); - if (ret) - return ret; - - ret = tce_iommu_create_window(container, create.page_shift, - create.window_size, create.levels, - &create.start_addr); + if (!ret) + ret = tce_iommu_create_window(container, + create.page_shift, + create.window_size, create.levels, + &create.start_addr); mutex_unlock(&container->lock); -- cgit v1.2.3 From 2c9f1af528a4581e8ef8590108daa3c3df08dd5a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 9 Feb 2017 16:01:58 +0000 Subject: vfio/type1: Fix error return code in vfio_iommu_type1_attach_group() Fix to return a negative error code from the error handling case instead of 0, as done elsewhere in this function. 
Fixes: 5d704992189f ("vfio/type1: Allow transparent MSI IOVA allocation") Signed-off-by: Wei Yongjun Reviewed-by: Eric Auger Acked-by: Alex Williamson Signed-off-by: Joerg Roedel --- drivers/vfio/vfio_iommu_type1.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/vfio') diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 0f353f519574..bd6f293c4ebd 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -1332,8 +1332,11 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_detach; - if (resv_msi && iommu_get_msi_cookie(domain->domain, resv_msi_base)) - goto out_detach; + if (resv_msi) { + ret = iommu_get_msi_cookie(domain->domain, resv_msi_base); + if (ret) + goto out_detach; + } list_add(&domain->next, &iommu->domain_list); -- cgit v1.2.3