diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-10 11:08:21 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-01-10 11:08:21 -0800 |
commit | 1c8106528aa6bf16b3f457de80df1cf7462a49a4 (patch) | |
tree | 4aed009c4a36195fd14c9f8d70fe2723a49583da /drivers/iommu | |
parent | 1a464cbb3d483f2f195b614cffa4aa1b910a0440 (diff) | |
parent | f93ea733878733f3e98475bc3e2ccf789bebcfb8 (diff) | |
download | linux-1c8106528aa6bf16b3f457de80df1cf7462a49a4.tar.gz linux-1c8106528aa6bf16b3f457de80df1cf7462a49a4.tar.bz2 linux-1c8106528aa6bf16b3f457de80df1cf7462a49a4.zip |
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu
* 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu: (53 commits)
iommu/amd: Set IOTLB invalidation timeout
iommu/amd: Init stats for iommu=pt
iommu/amd: Remove unnecessary cache flushes in amd_iommu_resume
iommu/amd: Add invalidate-context call-back
iommu/amd: Add amd_iommu_device_info() function
iommu/amd: Adapt IOMMU driver to PCI register name changes
iommu/amd: Add invalid_ppr callback
iommu/amd: Implement notifiers for IOMMUv2
iommu/amd: Implement IO page-fault handler
iommu/amd: Add routines to bind/unbind a pasid
iommu/amd: Implement device aquisition code for IOMMUv2
iommu/amd: Add driver stub for AMD IOMMUv2 support
iommu/amd: Add stat counter for IOMMUv2 events
iommu/amd: Add device errata handling
iommu/amd: Add function to get IOMMUv2 domain for pdev
iommu/amd: Implement function to send PPR completions
iommu/amd: Implement functions to manage GCR3 table
iommu/amd: Implement IOMMUv2 TLB flushing routines
iommu/amd: Add support for IOMMUv2 domain mode
iommu/amd: Add amd_iommu_domain_direct_map function
...
Diffstat (limited to 'drivers/iommu')
-rw-r--r-- | drivers/iommu/Kconfig | 13 | ||||
-rw-r--r-- | drivers/iommu/Makefile | 1 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu.c | 883 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu_init.c | 133 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu_proto.h | 24 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu_types.h | 118 | ||||
-rw-r--r-- | drivers/iommu/amd_iommu_v2.c | 994 | ||||
-rw-r--r-- | drivers/iommu/intel-iommu.c | 79 | ||||
-rw-r--r-- | drivers/iommu/iommu.c | 177 | ||||
-rw-r--r-- | drivers/iommu/msm_iommu.c | 25 | ||||
-rw-r--r-- | drivers/iommu/omap-iommu.c | 80 | ||||
-rw-r--r-- | drivers/iommu/omap-iovmm.c | 48 |
12 files changed, 2424 insertions, 151 deletions
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 5414253b185a..6bea6962f8ee 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -34,7 +34,9 @@ config AMD_IOMMU bool "AMD IOMMU support" select SWIOTLB select PCI_MSI - select PCI_IOV + select PCI_ATS + select PCI_PRI + select PCI_PASID select IOMMU_API depends on X86_64 && PCI && ACPI ---help--- @@ -58,6 +60,15 @@ config AMD_IOMMU_STATS information to userspace via debugfs. If unsure, say N. +config AMD_IOMMU_V2 + tristate "AMD IOMMU Version 2 driver (EXPERIMENTAL)" + depends on AMD_IOMMU && PROFILING && EXPERIMENTAL + select MMU_NOTIFIER + ---help--- + This option enables support for the AMD IOMMUv2 features of the IOMMU + hardware. Select this option if you want to use devices that support + the the PCI PRI and PASID interface. + # Intel IOMMU support config DMAR_TABLE bool diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 2f4448794bc7..0e36b4934aff 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o +obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o obj-$(CONFIG_DMAR_TABLE) += dmar.o obj-$(CONFIG_INTEL_IOMMU) += iova.o intel-iommu.o obj-$(CONFIG_IRQ_REMAP) += intr_remapping.o diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 4ee277a8521a..cce1f03b8895 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -17,6 +17,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/ratelimit.h> #include <linux/pci.h> #include <linux/pci-ats.h> #include <linux/bitmap.h> @@ -28,6 +29,8 @@ #include <linux/iommu.h> #include <linux/delay.h> #include <linux/amd-iommu.h> +#include <linux/notifier.h> +#include <linux/export.h> #include <asm/msidef.h> #include <asm/proto.h> #include <asm/iommu.h> @@ -41,6 +44,24 @@ #define LOOP_TIMEOUT 100000 +/* + * This bitmap is used to advertise the page sizes our hardware support + * to the IOMMU core, which will then use this information to split + * physically contiguous memory regions it is mapping into page sizes + * that we support. + * + * Traditionally the IOMMU core just handed us the mappings directly, + * after making sure the size is an order of a 4KiB page and that the + * mapping has natural alignment. + * + * To retain this behavior, we currently advertise that we support + * all page sizes that are an order of 4KiB. + * + * If at some point we'd like to utilize the IOMMU core's new behavior, + * we could change this to advertise the real page sizes we support. + */ +#define AMD_IOMMU_PGSIZES (~0xFFFUL) + static DEFINE_RWLOCK(amd_iommu_devtable_lock); /* A list of preallocated protection domains */ @@ -59,6 +80,9 @@ static struct protection_domain *pt_domain; static struct iommu_ops amd_iommu_ops; +static ATOMIC_NOTIFIER_HEAD(ppr_notifier); +int amd_iommu_max_glx_val = -1; + /* * general struct to manage commands send to an IOMMU */ @@ -67,6 +91,7 @@ struct iommu_cmd { }; static void update_domain(struct protection_domain *domain); +static int __init alloc_passthrough_domain(void); /**************************************************************************** * @@ -147,6 +172,33 @@ static struct iommu_dev_data *get_dev_data(struct device *dev) return dev->archdata.iommu; } +static bool pci_iommuv2_capable(struct pci_dev *pdev) +{ + static const int caps[] = { + PCI_EXT_CAP_ID_ATS, + PCI_EXT_CAP_ID_PRI, + PCI_EXT_CAP_ID_PASID, + }; + int i, pos; + + for (i = 0; i < 3; ++i) { + pos = pci_find_ext_capability(pdev, caps[i]); + if (pos == 0) + return false; + } + + return true; +} + +static bool pdev_pri_erratum(struct pci_dev *pdev, u32 erratum) +{ + struct iommu_dev_data *dev_data; + + dev_data = get_dev_data(&pdev->dev); + + return dev_data->errata & (1 << erratum) ? true : false; +} + /* * In this function the list of preallocated protection domains is traversed to * find the domain for a specific device @@ -204,6 +256,7 @@ static bool check_device(struct device *dev) static int iommu_init_device(struct device *dev) { + struct pci_dev *pdev = to_pci_dev(dev); struct iommu_dev_data *dev_data; u16 alias; @@ -228,6 +281,13 @@ static int iommu_init_device(struct device *dev) dev_data->alias_data = alias_data; } + if (pci_iommuv2_capable(pdev)) { + struct amd_iommu *iommu; + + iommu = amd_iommu_rlookup_table[dev_data->devid]; + dev_data->iommu_v2 = iommu->is_iommu_v2; + } + dev->archdata.iommu = dev_data; return 0; @@ -317,6 +377,11 @@ DECLARE_STATS_COUNTER(domain_flush_single); DECLARE_STATS_COUNTER(domain_flush_all); DECLARE_STATS_COUNTER(alloced_io_mem); DECLARE_STATS_COUNTER(total_map_requests); +DECLARE_STATS_COUNTER(complete_ppr); +DECLARE_STATS_COUNTER(invalidate_iotlb); +DECLARE_STATS_COUNTER(invalidate_iotlb_all); +DECLARE_STATS_COUNTER(pri_requests); + static struct dentry *stats_dir; static struct dentry *de_fflush; @@ -351,6 +416,10 @@ static void amd_iommu_stats_init(void) amd_iommu_stats_add(&domain_flush_all); amd_iommu_stats_add(&alloced_io_mem); amd_iommu_stats_add(&total_map_requests); + amd_iommu_stats_add(&complete_ppr); + amd_iommu_stats_add(&invalidate_iotlb); + amd_iommu_stats_add(&invalidate_iotlb_all); + amd_iommu_stats_add(&pri_requests); } #endif @@ -365,8 +434,8 @@ static void dump_dte_entry(u16 devid) { int i; - for (i = 0; i < 8; ++i) - pr_err("AMD-Vi: DTE[%d]: %08x\n", i, + for (i = 0; i < 4; ++i) + pr_err("AMD-Vi: DTE[%d]: %016llx\n", i, amd_iommu_dev_table[devid].data[i]); } @@ -461,12 +530,84 @@ static void iommu_poll_events(struct amd_iommu *iommu) spin_unlock_irqrestore(&iommu->lock, flags); } +static void iommu_handle_ppr_entry(struct amd_iommu *iommu, u32 head) +{ + struct amd_iommu_fault fault; + volatile u64 *raw; + int i; + + INC_STATS_COUNTER(pri_requests); + + raw = (u64 *)(iommu->ppr_log + head); + + /* + * Hardware bug: Interrupt may arrive before the entry is written to + * memory. If this happens we need to wait for the entry to arrive. + */ + for (i = 0; i < LOOP_TIMEOUT; ++i) { + if (PPR_REQ_TYPE(raw[0]) != 0) + break; + udelay(1); + } + + if (PPR_REQ_TYPE(raw[0]) != PPR_REQ_FAULT) { + pr_err_ratelimited("AMD-Vi: Unknown PPR request received\n"); + return; + } + + fault.address = raw[1]; + fault.pasid = PPR_PASID(raw[0]); + fault.device_id = PPR_DEVID(raw[0]); + fault.tag = PPR_TAG(raw[0]); + fault.flags = PPR_FLAGS(raw[0]); + + /* + * To detect the hardware bug we need to clear the entry + * to back to zero. + */ + raw[0] = raw[1] = 0; + + atomic_notifier_call_chain(&ppr_notifier, 0, &fault); +} + +static void iommu_poll_ppr_log(struct amd_iommu *iommu) +{ + unsigned long flags; + u32 head, tail; + + if (iommu->ppr_log == NULL) + return; + + spin_lock_irqsave(&iommu->lock, flags); + + head = readl(iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); + + while (head != tail) { + + /* Handle PPR entry */ + iommu_handle_ppr_entry(iommu, head); + + /* Update and refresh ring-buffer state*/ + head = (head + PPR_ENTRY_SIZE) % PPR_LOG_SIZE; + writel(head, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); + } + + /* enable ppr interrupts again */ + writel(MMIO_STATUS_PPR_INT_MASK, iommu->mmio_base + MMIO_STATUS_OFFSET); + + spin_unlock_irqrestore(&iommu->lock, flags); +} + irqreturn_t amd_iommu_int_thread(int irq, void *data) { struct amd_iommu *iommu; - for_each_iommu(iommu) + for_each_iommu(iommu) { iommu_poll_events(iommu); + iommu_poll_ppr_log(iommu); + } return IRQ_HANDLED; } @@ -595,6 +736,60 @@ static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep, cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; } +static void build_inv_iommu_pasid(struct iommu_cmd *cmd, u16 domid, int pasid, + u64 address, bool size) +{ + memset(cmd, 0, sizeof(*cmd)); + + address &= ~(0xfffULL); + + cmd->data[0] = pasid & PASID_MASK; + cmd->data[1] = domid; + cmd->data[2] = lower_32_bits(address); + cmd->data[3] = upper_32_bits(address); + cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + if (size) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES); +} + +static void build_inv_iotlb_pasid(struct iommu_cmd *cmd, u16 devid, int pasid, + int qdep, u64 address, bool size) +{ + memset(cmd, 0, sizeof(*cmd)); + + address &= ~(0xfffULL); + + cmd->data[0] = devid; + cmd->data[0] |= (pasid & 0xff) << 16; + cmd->data[0] |= (qdep & 0xff) << 24; + cmd->data[1] = devid; + cmd->data[1] |= ((pasid >> 8) & 0xfff) << 16; + cmd->data[2] = lower_32_bits(address); + cmd->data[2] |= CMD_INV_IOMMU_PAGES_GN_MASK; + cmd->data[3] = upper_32_bits(address); + if (size) + cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; + CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES); +} + +static void build_complete_ppr(struct iommu_cmd *cmd, u16 devid, int pasid, + int status, int tag, bool gn) +{ + memset(cmd, 0, sizeof(*cmd)); + + cmd->data[0] = devid; + if (gn) { + cmd->data[1] = pasid & PASID_MASK; + cmd->data[2] = CMD_INV_IOMMU_PAGES_GN_MASK; + } + cmd->data[3] = tag & 0x1ff; + cmd->data[3] |= (status & PPR_STATUS_MASK) << PPR_STATUS_SHIFT; + + CMD_SET_TYPE(cmd, CMD_COMPLETE_PPR); +} + static void build_inv_all(struct iommu_cmd *cmd) { memset(cmd, 0, sizeof(*cmd)); @@ -1496,6 +1691,48 @@ static void free_pagetable(struct protection_domain *domain) domain->pt_root = NULL; } +static void free_gcr3_tbl_level1(u64 *tbl) +{ + u64 *ptr; + int i; + + for (i = 0; i < 512; ++i) { + if (!(tbl[i] & GCR3_VALID)) + continue; + + ptr = __va(tbl[i] & PAGE_MASK); + + free_page((unsigned long)ptr); + } +} + +static void free_gcr3_tbl_level2(u64 *tbl) +{ + u64 *ptr; + int i; + + for (i = 0; i < 512; ++i) { + if (!(tbl[i] & GCR3_VALID)) + continue; + + ptr = __va(tbl[i] & PAGE_MASK); + + free_gcr3_tbl_level1(ptr); + } +} + +static void free_gcr3_table(struct protection_domain *domain) +{ + if (domain->glx == 2) + free_gcr3_tbl_level2(domain->gcr3_tbl); + else if (domain->glx == 1) + free_gcr3_tbl_level1(domain->gcr3_tbl); + else if (domain->glx != 0) + BUG(); + + free_page((unsigned long)domain->gcr3_tbl); +} + /* * Free a domain, only used if something went wrong in the * allocation path and we need to free an already allocated page table @@ -1582,20 +1819,52 @@ static bool dma_ops_domain(struct protection_domain *domain) static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats) { - u64 pte_root = virt_to_phys(domain->pt_root); - u32 flags = 0; + u64 pte_root = 0; + u64 flags = 0; + + if (domain->mode != PAGE_MODE_NONE) + pte_root = virt_to_phys(domain->pt_root); pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) << DEV_ENTRY_MODE_SHIFT; pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; + flags = amd_iommu_dev_table[devid].data[1]; + if (ats) flags |= DTE_FLAG_IOTLB; - amd_iommu_dev_table[devid].data[3] |= flags; - amd_iommu_dev_table[devid].data[2] = domain->id; - amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); - amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + if (domain->flags & PD_IOMMUV2_MASK) { + u64 gcr3 = __pa(domain->gcr3_tbl); + u64 glx = domain->glx; + u64 tmp; + + pte_root |= DTE_FLAG_GV; + pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT; + + /* First mask out possible old values for GCR3 table */ + tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B; + flags &= ~tmp; + + tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C; + flags &= ~tmp; + + /* Encode GCR3 table into DTE */ + tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A; + pte_root |= tmp; + + tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B; + flags |= tmp; + + tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C; + flags |= tmp; + } + + flags &= ~(0xffffUL); + flags |= domain->id; + + amd_iommu_dev_table[devid].data[1] = flags; + amd_iommu_dev_table[devid].data[0] = pte_root; } static void clear_dte_entry(u16 devid) @@ -1603,7 +1872,6 @@ static void clear_dte_entry(u16 devid) /* remove entry from the device table seen by the hardware */ amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV; amd_iommu_dev_table[devid].data[1] = 0; - amd_iommu_dev_table[devid].data[2] = 0; amd_iommu_apply_erratum_63(devid); } @@ -1696,6 +1964,93 @@ out_unlock: return ret; } + +static void pdev_iommuv2_disable(struct pci_dev *pdev) +{ + pci_disable_ats(pdev); + pci_disable_pri(pdev); + pci_disable_pasid(pdev); +} + +/* FIXME: Change generic reset-function to do the same */ +static int pri_reset_while_enabled(struct pci_dev *pdev) +{ + u16 control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (!pos) + return -EINVAL; + + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + control |= PCI_PRI_CTRL_RESET; + pci_write_config_word(pdev, pos + PCI_PRI_CTRL, control); + + return 0; +} + +static int pdev_iommuv2_enable(struct pci_dev *pdev) +{ + bool reset_enable; + int reqs, ret; + + /* FIXME: Hardcode number of outstanding requests for now */ + reqs = 32; + if (pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_LIMIT_REQ_ONE)) + reqs = 1; + reset_enable = pdev_pri_erratum(pdev, AMD_PRI_DEV_ERRATUM_ENABLE_RESET); + + /* Only allow access to user-accessible pages */ + ret = pci_enable_pasid(pdev, 0); + if (ret) + goto out_err; + + /* First reset the PRI state of the device */ + ret = pci_reset_pri(pdev); + if (ret) + goto out_err; + + /* Enable PRI */ + ret = pci_enable_pri(pdev, reqs); + if (ret) + goto out_err; + + if (reset_enable) { + ret = pri_reset_while_enabled(pdev); + if (ret) + goto out_err; + } + + ret = pci_enable_ats(pdev, PAGE_SHIFT); + if (ret) + goto out_err; + + return 0; + +out_err: + pci_disable_pri(pdev); + pci_disable_pasid(pdev); + + return ret; +} + +/* FIXME: Move this to PCI code */ +#define PCI_PRI_TLP_OFF (1 << 2) + +bool pci_pri_tlp_required(struct pci_dev *pdev) +{ + u16 control; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (!pos) + return false; + + pci_read_config_word(pdev, pos + PCI_PRI_CTRL, &control); + + return (control & PCI_PRI_TLP_OFF) ? true : false; +} + /* * If a device is not yet associated with a domain, this function does * assigns it visible for the hardware @@ -1710,7 +2065,18 @@ static int attach_device(struct device *dev, dev_data = get_dev_data(dev); - if (amd_iommu_iotlb_sup && pci_enable_ats(pdev, PAGE_SHIFT) == 0) { + if (domain->flags & PD_IOMMUV2_MASK) { + if (!dev_data->iommu_v2 || !dev_data->passthrough) + return -EINVAL; + + if (pdev_iommuv2_enable(pdev) != 0) + return -EINVAL; + + dev_data->ats.enabled = true; + dev_data->ats.qdep = pci_ats_queue_depth(pdev); + dev_data->pri_tlp = pci_pri_tlp_required(pdev); + } else if (amd_iommu_iotlb_sup && + pci_enable_ats(pdev, PAGE_SHIFT) == 0) { dev_data->ats.enabled = true; dev_data->ats.qdep = pci_ats_queue_depth(pdev); } @@ -1760,7 +2126,7 @@ static void __detach_device(struct iommu_dev_data *dev_data) * passthrough domain if it is detached from any other domain. * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through && + if (dev_data->passthrough && (dev_data->domain == NULL && domain != pt_domain)) __attach_device(dev_data, pt_domain); } @@ -1770,20 +2136,24 @@ static void __detach_device(struct iommu_dev_data *dev_data) */ static void detach_device(struct device *dev) { + struct protection_domain *domain; struct iommu_dev_data *dev_data; unsigned long flags; dev_data = get_dev_data(dev); + domain = dev_data->domain; /* lock device table */ write_lock_irqsave(&amd_iommu_devtable_lock, flags); __detach_device(dev_data); write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); - if (dev_data->ats.enabled) { + if (domain->flags & PD_IOMMUV2_MASK) + pdev_iommuv2_disable(to_pci_dev(dev)); + else if (dev_data->ats.enabled) pci_disable_ats(to_pci_dev(dev)); - dev_data->ats.enabled = false; - } + + dev_data->ats.enabled = false; } /* @@ -1818,18 +2188,20 @@ static struct protection_domain *domain_for_device(struct device *dev) static int device_change_notifier(struct notifier_block *nb, unsigned long action, void *data) { - struct device *dev = data; - u16 devid; - struct protection_domain *domain; struct dma_ops_domain *dma_domain; + struct protection_domain *domain; + struct iommu_dev_data *dev_data; + struct device *dev = data; struct amd_iommu *iommu; unsigned long flags; + u16 devid; if (!check_device(dev)) return 0; - devid = get_device_id(dev); - iommu = amd_iommu_rlookup_table[devid]; + devid = get_device_id(dev); + iommu = amd_iommu_rlookup_table[devid]; + dev_data = get_dev_data(dev); switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: @@ -1838,7 +2210,7 @@ static int device_change_notifier(struct notifier_block *nb, if (!domain) goto out; - if (iommu_pass_through) + if (dev_data->passthrough) break; detach_device(dev); break; @@ -2434,8 +2806,9 @@ static int amd_iommu_dma_supported(struct device *dev, u64 mask) */ static void prealloc_protection_domains(void) { - struct pci_dev *dev = NULL; + struct iommu_dev_data *dev_data; struct dma_ops_domain *dma_dom; + struct pci_dev *dev = NULL; u16 devid; for_each_pci_dev(dev) { @@ -2444,6 +2817,16 @@ static void prealloc_protection_domains(void) if (!check_device(&dev->dev)) continue; + dev_data = get_dev_data(&dev->dev); + if (!amd_iommu_force_isolation && dev_data->iommu_v2) { + /* Make sure passthrough domain is allocated */ + alloc_passthrough_domain(); + dev_data->passthrough = true; + attach_device(&dev->dev, pt_domain); + pr_info("AMD-Vi: Using passthough domain for device %s\n", + dev_name(&dev->dev)); + } + /* Is there already any domain for it? */ if (domain_for_device(&dev->dev)) continue; @@ -2474,6 +2857,7 @@ static struct dma_map_ops amd_iommu_dma_ops = { static unsigned device_dma_ops_init(void) { + struct iommu_dev_data *dev_data; struct pci_dev *pdev = NULL; unsigned unhandled = 0; @@ -2483,7 +2867,12 @@ static unsigned device_dma_ops_init(void) continue; } - pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; + dev_data = get_dev_data(&pdev->dev); + + if (!dev_data->passthrough) + pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; + else + pdev->dev.archdata.dma_ops = &nommu_dma_ops; } return unhandled; @@ -2610,6 +2999,20 @@ out_err: return NULL; } +static int __init alloc_passthrough_domain(void) +{ + if (pt_domain != NULL) + return 0; + + /* allocate passthrough domain */ + pt_domain = protection_domain_alloc(); + if (!pt_domain) + return -ENOMEM; + + pt_domain->mode = PAGE_MODE_NONE; + + return 0; +} static int amd_iommu_domain_init(struct iommu_domain *dom) { struct protection_domain *domain; @@ -2623,6 +3026,8 @@ static int amd_iommu_domain_init(struct iommu_domain *dom) if (!domain->pt_root) goto out_free; + domain->iommu_domain = dom; + dom->priv = domain; return 0; @@ -2645,7 +3050,11 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom) BUG_ON(domain->dev_cnt != 0); - free_pagetable(domain); + if (domain->mode != PAGE_MODE_NONE) + free_pagetable(domain); + + if (domain->flags & PD_IOMMUV2_MASK) + free_gcr3_table(domain); protection_domain_free(domain); @@ -2702,13 +3111,15 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, } static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, - phys_addr_t paddr, int gfp_order, int iommu_prot) + phys_addr_t paddr, size_t page_size, int iommu_prot) { - unsigned long page_size = 0x1000UL << gfp_order; struct protection_domain *domain = dom->priv; int prot = 0; int ret; + if (domain->mode == PAGE_MODE_NONE) + return -EINVAL; + if (iommu_prot & IOMMU_READ) prot |= IOMMU_PROT_IR; if (iommu_prot & IOMMU_WRITE) @@ -2721,13 +3132,14 @@ static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, return ret; } -static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, - int gfp_order) +static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, + size_t page_size) { struct protection_domain *domain = dom->priv; - unsigned long page_size, unmap_size; + size_t unmap_size; - page_size = 0x1000UL << gfp_order; + if (domain->mode == PAGE_MODE_NONE) + return -EINVAL; mutex_lock(&domain->api_lock); unmap_size = iommu_unmap_page(domain, iova, page_size); @@ -2735,7 +3147,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, domain_flush_tlb_pde(domain); - return get_order(unmap_size); + return unmap_size; } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, @@ -2746,6 +3158,9 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, phys_addr_t paddr; u64 *pte, __pte; + if (domain->mode == PAGE_MODE_NONE) + return iova; + pte = fetch_pte(domain, iova); if (!pte || !IOMMU_PTE_PRESENT(*pte)) @@ -2773,6 +3188,26 @@ static int amd_iommu_domain_has_cap(struct iommu_domain *domain, return 0; } +static int amd_iommu_device_group(struct device *dev, unsigned int *groupid) +{ + struct iommu_dev_data *dev_data = dev->archdata.iommu; + struct pci_dev *pdev = to_pci_dev(dev); + u16 devid; + + if (!dev_data) + return -ENODEV; + + if (pdev->is_virtfn || !iommu_group_mf) + devid = dev_data->devid; + else + devid = calc_devid(pdev->bus->number, + PCI_DEVFN(PCI_SLOT(pdev->devfn), 0)); + + *groupid = amd_iommu_alias_table[devid]; + + return 0; +} + static struct iommu_ops amd_iommu_ops = { .domain_init = amd_iommu_domain_init, .domain_destroy = amd_iommu_domain_destroy, @@ -2782,6 +3217,8 @@ static struct iommu_ops amd_iommu_ops = { .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, .domain_has_cap = amd_iommu_domain_has_cap, + .device_group = amd_iommu_device_group, + .pgsize_bitmap = AMD_IOMMU_PGSIZES, }; /***************************************************************************** @@ -2796,21 +3233,23 @@ static struct iommu_ops amd_iommu_ops = { int __init amd_iommu_init_passthrough(void) { - struct amd_iommu *iommu; + struct iommu_dev_data *dev_data; struct pci_dev *dev = NULL; + struct amd_iommu *iommu; u16 devid; + int ret; - /* allocate passthrough domain */ - pt_domain = protection_domain_alloc(); - if (!pt_domain) - return -ENOMEM; - - pt_domain->mode |= PAGE_MODE_NONE; + ret = alloc_passthrough_domain(); + if (ret) + return ret; for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; + dev_data = get_dev_data(&dev->dev); + dev_data->passthrough = true; + devid = get_device_id(&dev->dev); iommu = amd_iommu_rlookup_table[devid]; @@ -2820,7 +3259,375 @@ int __init amd_iommu_init_passthrough(void) attach_device(&dev->dev, pt_domain); } + amd_iommu_stats_init(); + pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); return 0; } + +/* IOMMUv2 specific functions */ +int amd_iommu_register_ppr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_register(&ppr_notifier, nb); +} +EXPORT_SYMBOL(amd_iommu_register_ppr_notifier); + +int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&ppr_notifier, nb); +} +EXPORT_SYMBOL(amd_iommu_unregister_ppr_notifier); + +void amd_iommu_domain_direct_map(struct iommu_domain *dom) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + + spin_lock_irqsave(&domain->lock, flags); + + /* Update data structure */ + domain->mode = PAGE_MODE_NONE; + domain->updated = true; + + /* Make changes visible to IOMMUs */ + update_domain(domain); + + /* Page-table is not visible to IOMMU anymore, so free it */ + free_pagetable(domain); + + spin_unlock_irqrestore(&domain->lock, flags); +} +EXPORT_SYMBOL(amd_iommu_domain_direct_map); + +int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int levels, ret; + + if (pasids <= 0 || pasids > (PASID_MASK + 1)) + return -EINVAL; + + /* Number of GCR3 table levels required */ + for (levels = 0; (pasids - 1) & ~0x1ff; pasids >>= 9) + levels += 1; + + if (levels > amd_iommu_max_glx_val) + return -EINVAL; + + spin_lock_irqsave(&domain->lock, flags); + + /* + * Save us all sanity checks whether devices already in the + * domain support IOMMUv2. Just force that the domain has no + * devices attached when it is switched into IOMMUv2 mode. + */ + ret = -EBUSY; + if (domain->dev_cnt > 0 || domain->flags & PD_IOMMUV2_MASK) + goto out; + + ret = -ENOMEM; + domain->gcr3_tbl = (void *)get_zeroed_page(GFP_ATOMIC); + if (domain->gcr3_tbl == NULL) + goto out; + + domain->glx = levels; + domain->flags |= PD_IOMMUV2_MASK; + domain->updated = true; + + update_domain(domain); + + ret = 0; + +out: + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_enable_v2); + +static int __flush_pasid(struct protection_domain *domain, int pasid, + u64 address, bool size) +{ + struct iommu_dev_data *dev_data; + struct iommu_cmd cmd; + int i, ret; + + if (!(domain->flags & PD_IOMMUV2_MASK)) + return -EINVAL; + + build_inv_iommu_pasid(&cmd, domain->id, pasid, address, size); + + /* + * IOMMU TLB needs to be flushed before Device TLB to + * prevent device TLB refill from IOMMU TLB + */ + for (i = 0; i < amd_iommus_present; ++i) { + if (domain->dev_iommu[i] == 0) + continue; + + ret = iommu_queue_command(amd_iommus[i], &cmd); + if (ret != 0) + goto out; + } + + /* Wait until IOMMU TLB flushes are complete */ + domain_flush_complete(domain); + + /* Now flush device TLBs */ + list_for_each_entry(dev_data, &domain->dev_list, list) { + struct amd_iommu *iommu; + int qdep; + + BUG_ON(!dev_data->ats.enabled); + + qdep = dev_data->ats.qdep; + iommu = amd_iommu_rlookup_table[dev_data->devid]; + + build_inv_iotlb_pasid(&cmd, dev_data->devid, pasid, + qdep, address, size); + + ret = iommu_queue_command(iommu, &cmd); + if (ret != 0) + goto out; + } + + /* Wait until all device TLBs are flushed */ + domain_flush_complete(domain); + + ret = 0; + +out: + + return ret; +} + +static int __amd_iommu_flush_page(struct protection_domain *domain, int pasid, + u64 address) +{ + INC_STATS_COUNTER(invalidate_iotlb); + + return __flush_pasid(domain, pasid, address, false); +} + +int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, + u64 address) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __amd_iommu_flush_page(domain, pasid, address); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_flush_page); + +static int __amd_iommu_flush_tlb(struct protection_domain *domain, int pasid) +{ + INC_STATS_COUNTER(invalidate_iotlb_all); + + return __flush_pasid(domain, pasid, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, + true); +} + +int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __amd_iommu_flush_tlb(domain, pasid); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_flush_tlb); + +static u64 *__get_gcr3_pte(u64 *root, int level, int pasid, bool alloc) +{ + int index; + u64 *pte; + + while (true) { + + index = (pasid >> (9 * level)) & 0x1ff; + pte = &root[index]; + + if (level == 0) + break; + + if (!(*pte & GCR3_VALID)) { + if (!alloc) + return NULL; + + root = (void *)get_zeroed_page(GFP_ATOMIC); + if (root == NULL) + return NULL; + + *pte = __pa(root) | GCR3_VALID; + } + + root = __va(*pte & PAGE_MASK); + + level -= 1; + } + + return pte; +} + +static int __set_gcr3(struct protection_domain *domain, int pasid, + unsigned long cr3) +{ + u64 *pte; + + if (domain->mode != PAGE_MODE_NONE) + return -EINVAL; + + pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, true); + if (pte == NULL) + return -ENOMEM; + + *pte = (cr3 & PAGE_MASK) | GCR3_VALID; + + return __amd_iommu_flush_tlb(domain, pasid); +} + +static int __clear_gcr3(struct protection_domain *domain, int pasid) +{ + u64 *pte; + + if (domain->mode != PAGE_MODE_NONE) + return -EINVAL; + + pte = __get_gcr3_pte(domain->gcr3_tbl, domain->glx, pasid, false); + if (pte == NULL) + return 0; + + *pte = 0; + + return __amd_iommu_flush_tlb(domain, pasid); +} + +int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, + unsigned long cr3) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __set_gcr3(domain, pasid, cr3); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_set_gcr3); + +int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid) +{ + struct protection_domain *domain = dom->priv; + unsigned long flags; + int ret; + + spin_lock_irqsave(&domain->lock, flags); + ret = __clear_gcr3(domain, pasid); + spin_unlock_irqrestore(&domain->lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_domain_clear_gcr3); + +int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, + int status, int tag) +{ + struct iommu_dev_data *dev_data; + struct amd_iommu *iommu; + struct iommu_cmd cmd; + + INC_STATS_COUNTER(complete_ppr); + + dev_data = get_dev_data(&pdev->dev); + iommu = amd_iommu_rlookup_table[dev_data->devid]; + + build_complete_ppr(&cmd, dev_data->devid, pasid, status, + tag, dev_data->pri_tlp); + + return iommu_queue_command(iommu, &cmd); +} +EXPORT_SYMBOL(amd_iommu_complete_ppr); + +struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev) +{ + struct protection_domain *domain; + + domain = get_domain(&pdev->dev); + if (IS_ERR(domain)) + return NULL; + + /* Only return IOMMUv2 domains */ + if (!(domain->flags & PD_IOMMUV2_MASK)) + return NULL; + + return domain->iommu_domain; +} +EXPORT_SYMBOL(amd_iommu_get_v2_domain); + +void amd_iommu_enable_device_erratum(struct pci_dev *pdev, u32 erratum) +{ + struct iommu_dev_data *dev_data; + + if (!amd_iommu_v2_supported()) + return; + + dev_data = get_dev_data(&pdev->dev); + dev_data->errata |= (1 << erratum); +} +EXPORT_SYMBOL(amd_iommu_enable_device_erratum); + +int amd_iommu_device_info(struct pci_dev *pdev, + struct amd_iommu_device_info *info) +{ + int max_pasids; + int pos; + + if (pdev == NULL || info == NULL) + return -EINVAL; + + if (!amd_iommu_v2_supported()) + return -EINVAL; + + memset(info, 0, sizeof(*info)); + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ATS); + if (pos) + info->flags |= AMD_IOMMU_DEVICE_FLAG_ATS_SUP; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (pos) + info->flags |= AMD_IOMMU_DEVICE_FLAG_PRI_SUP; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PASID); + if (pos) { + int features; + + max_pasids = 1 << (9 * (amd_iommu_max_glx_val + 1)); + max_pasids = min(max_pasids, (1 << 20)); + + info->flags |= AMD_IOMMU_DEVICE_FLAG_PASID_SUP; + info->max_pasids = min(pci_max_pasids(pdev), max_pasids); + + features = pci_pasid_features(pdev); + if (features & PCI_PASID_CAP_EXEC) + info->flags |= AMD_IOMMU_DEVICE_FLAG_EXEC_SUP; + if (features & PCI_PASID_CAP_PRIV) + info->flags |= AMD_IOMMU_DEVICE_FLAG_PRIV_SUP; + } + + return 0; +} +EXPORT_SYMBOL(amd_iommu_device_info); diff --git a/drivers/iommu/amd_iommu_init.c b/drivers/iommu/amd_iommu_init.c index 82d2410f4205..bdea288dc185 100644 --- a/drivers/iommu/amd_iommu_init.c +++ b/drivers/iommu/amd_iommu_init.c @@ -25,6 +25,7 @@ #include <linux/interrupt.h> #include <linux/msi.h> #include <linux/amd-iommu.h> +#include <linux/export.h> #include <asm/pci-direct.h> #include <asm/iommu.h> #include <asm/gart.h> @@ -141,6 +142,12 @@ int amd_iommus_present; bool amd_iommu_np_cache __read_mostly; bool amd_iommu_iotlb_sup __read_mostly = true; +u32 amd_iommu_max_pasids __read_mostly = ~0; + +bool amd_iommu_v2_present __read_mostly; + +bool amd_iommu_force_isolation __read_mostly; + /* * The ACPI table parsing functions set this variable on an error */ @@ -299,6 +306,16 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } +static void iommu_set_inv_tlb_timeout(struct amd_iommu *iommu, int timeout) +{ + u32 ctrl; + + ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl &= ~CTRL_INV_TO_MASK; + ctrl |= (timeout << CONTROL_INV_TIMEOUT) & CTRL_INV_TO_MASK; + writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); +} + /* Function to enable the hardware */ static void iommu_enable(struct amd_iommu *iommu) { @@ -581,21 +598,69 @@ static void __init free_event_buffer(struct amd_iommu *iommu) free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); } +/* allocates the memory where the IOMMU will log its events to */ +static u8 * __init alloc_ppr_log(struct amd_iommu *iommu) +{ + iommu->ppr_log = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(PPR_LOG_SIZE)); + + if (iommu->ppr_log == NULL) + return NULL; + + return iommu->ppr_log; +} + +static void iommu_enable_ppr_log(struct amd_iommu *iommu) +{ + u64 entry; + + if (iommu->ppr_log == NULL) + return; + + entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512; + + memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET, + &entry, sizeof(entry)); + + /* set head and tail to zero manually */ + writel(0x00, iommu->mmio_base + MMIO_PPR_HEAD_OFFSET); + writel(0x00, iommu->mmio_base + MMIO_PPR_TAIL_OFFSET); + + iommu_feature_enable(iommu, CONTROL_PPFLOG_EN); + iommu_feature_enable(iommu, CONTROL_PPR_EN); +} + +static void __init free_ppr_log(struct amd_iommu *iommu) +{ + if (iommu->ppr_log == NULL) + return; + + free_pages((unsigned long)iommu->ppr_log, get_order(PPR_LOG_SIZE)); +} + +static void iommu_enable_gt(struct amd_iommu *iommu) +{ + if (!iommu_feature(iommu, FEATURE_GT)) + return; + + iommu_feature_enable(iommu, CONTROL_GT_EN); +} + /* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { - int i = (bit >> 5) & 0x07; - int _bit = bit & 0x1f; + int i = (bit >> 6) & 0x03; + int _bit = bit & 0x3f; - amd_iommu_dev_table[devid].data[i] |= (1 << _bit); + amd_iommu_dev_table[devid].data[i] |= (1UL << _bit); } static int get_dev_entry_bit(u16 devid, u8 bit) { - int i = (bit >> 5) & 0x07; - int _bit = bit & 0x1f; + int i = (bit >> 6) & 0x03; + int _bit = bit & 0x3f; - return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; + return (amd_iommu_dev_table[devid].data[i] & (1UL << _bit)) >> _bit; } @@ -699,6 +764,32 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->features = ((u64)high << 32) | low; + if (iommu_feature(iommu, FEATURE_GT)) { + int glxval; + u32 pasids; + u64 shift; + + shift = iommu->features & FEATURE_PASID_MASK; + shift >>= FEATURE_PASID_SHIFT; + pasids = (1 << shift); + + amd_iommu_max_pasids = min(amd_iommu_max_pasids, pasids); + + glxval = iommu->features & FEATURE_GLXVAL_MASK; + glxval >>= FEATURE_GLXVAL_SHIFT; + + if (amd_iommu_max_glx_val == -1) + amd_iommu_max_glx_val = glxval; + else + amd_iommu_max_glx_val = min(amd_iommu_max_glx_val, glxval); + } + + if (iommu_feature(iommu, FEATURE_GT) && + iommu_feature(iommu, FEATURE_PPR)) { + iommu->is_iommu_v2 = true; + amd_iommu_v2_present = true; + } + if (!is_rd890_iommu(iommu->dev)) return; @@ -901,6 +992,7 @@ static void __init free_iommu_one(struct amd_iommu *iommu) { free_command_buffer(iommu); free_event_buffer(iommu); + free_ppr_log(iommu); iommu_unmap_mmio_space(iommu); } @@ -964,6 +1056,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + if (iommu_feature(iommu, FEATURE_PPR)) { + iommu->ppr_log = alloc_ppr_log(iommu); + if (!iommu->ppr_log) + return -ENOMEM; + } + if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE)) amd_iommu_np_cache = true; @@ -1050,6 +1148,9 @@ static int iommu_setup_msi(struct amd_iommu *iommu) iommu->int_enabled = true; iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + if (iommu->ppr_log != NULL) + iommu_feature_enable(iommu, CONTROL_PPFINT_EN); + return 0; } @@ -1209,6 +1310,9 @@ static void iommu_init_flags(struct amd_iommu *iommu) * make IOMMU memory accesses cache coherent */ iommu_feature_enable(iommu, CONTROL_COHERENT_EN); + + /* Set IOTLB invalidation timeout to 1s */ + iommu_set_inv_tlb_timeout(iommu, CTRL_INV_TO_1S); } static void iommu_apply_resume_quirks(struct amd_iommu *iommu) @@ -1274,6 +1378,8 @@ static void enable_iommus(void) iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); + iommu_enable_ppr_log(iommu); + iommu_enable_gt(iommu); iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); iommu_enable(iommu); @@ -1303,13 +1409,6 @@ static void amd_iommu_resume(void) /* re-load the hardware */ enable_iommus(); - - /* - * we have to flush after the IOMMUs are enabled because a - * disabled IOMMU will never execute the commands we send - */ - for_each_iommu(iommu) - iommu_flush_all_caches(iommu); } static int amd_iommu_suspend(void) @@ -1560,6 +1659,8 @@ static int __init parse_amd_iommu_options(char *str) amd_iommu_unmap_flush = true; if (strncmp(str, "off", 3) == 0) amd_iommu_disabled = true; + if (strncmp(str, "force_isolation", 15) == 0) + amd_iommu_force_isolation = true; } return 1; @@ -1572,3 +1673,9 @@ IOMMU_INIT_FINISH(amd_iommu_detect, gart_iommu_hole_init, 0, 0); + +bool amd_iommu_v2_supported(void) +{ + return amd_iommu_v2_present; +} +EXPORT_SYMBOL(amd_iommu_v2_supported); diff --git a/drivers/iommu/amd_iommu_proto.h b/drivers/iommu/amd_iommu_proto.h index 7ffaa64410b0..1a7f41c6cc66 100644 --- a/drivers/iommu/amd_iommu_proto.h +++ b/drivers/iommu/amd_iommu_proto.h @@ -31,6 +31,30 @@ extern int amd_iommu_init_devices(void); extern void amd_iommu_uninit_devices(void); extern void amd_iommu_init_notifier(void); extern void amd_iommu_init_api(void); + +/* IOMMUv2 specific functions */ +struct iommu_domain; + +extern bool amd_iommu_v2_supported(void); +extern int amd_iommu_register_ppr_notifier(struct notifier_block *nb); +extern int amd_iommu_unregister_ppr_notifier(struct notifier_block *nb); +extern void amd_iommu_domain_direct_map(struct iommu_domain *dom); +extern int amd_iommu_domain_enable_v2(struct iommu_domain *dom, int pasids); +extern int amd_iommu_flush_page(struct iommu_domain *dom, int pasid, + u64 address); +extern int amd_iommu_flush_tlb(struct iommu_domain *dom, int pasid); +extern int amd_iommu_domain_set_gcr3(struct iommu_domain *dom, int pasid, + unsigned long cr3); +extern int amd_iommu_domain_clear_gcr3(struct iommu_domain *dom, int pasid); +extern struct iommu_domain *amd_iommu_get_v2_domain(struct pci_dev *pdev); + +#define PPR_SUCCESS 0x0 +#define PPR_INVALID 0x1 +#define PPR_FAILURE 0xf + +extern int amd_iommu_complete_ppr(struct pci_dev *pdev, int pasid, + int status, int tag); + #ifndef CONFIG_AMD_IOMMU_STATS static inline void amd_iommu_stats_init(void) { } diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index 5b9c5075e81a..2452f3b71736 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -69,11 +69,14 @@ #define MMIO_EXCL_BASE_OFFSET 0x0020 #define MMIO_EXCL_LIMIT_OFFSET 0x0028 #define MMIO_EXT_FEATURES 0x0030 +#define MMIO_PPR_LOG_OFFSET 0x0038 #define MMIO_CMD_HEAD_OFFSET 0x2000 #define MMIO_CMD_TAIL_OFFSET 0x2008 #define MMIO_EVT_HEAD_OFFSET 0x2010 #define MMIO_EVT_TAIL_OFFSET 0x2018 #define MMIO_STATUS_OFFSET 0x2020 +#define MMIO_PPR_HEAD_OFFSET 0x2030 +#define MMIO_PPR_TAIL_OFFSET 0x2038 /* Extended Feature Bits */ @@ -87,8 +90,17 @@ #define FEATURE_HE (1ULL<<8) #define FEATURE_PC (1ULL<<9) +#define FEATURE_PASID_SHIFT 32 +#define FEATURE_PASID_MASK (0x1fULL << FEATURE_PASID_SHIFT) + +#define FEATURE_GLXVAL_SHIFT 14 +#define FEATURE_GLXVAL_MASK (0x03ULL << FEATURE_GLXVAL_SHIFT) + +#define PASID_MASK 0x000fffff + /* MMIO status bits */ -#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 +#define MMIO_STATUS_COM_WAIT_INT_MASK (1 << 2) +#define MMIO_STATUS_PPR_INT_MASK (1 << 6) /* event logging constants */ #define EVENT_ENTRY_SIZE 0x10 @@ -115,6 +127,7 @@ #define CONTROL_EVT_LOG_EN 0x02ULL #define CONTROL_EVT_INT_EN 0x03ULL #define CONTROL_COMWAIT_EN 0x04ULL +#define CONTROL_INV_TIMEOUT 0x05ULL #define CONTROL_PASSPW_EN 0x08ULL #define CONTROL_RESPASSPW_EN 0x09ULL #define CONTROL_COHERENT_EN 0x0aULL @@ -122,18 +135,34 @@ #define CONTROL_CMDBUF_EN 0x0cULL #define CONTROL_PPFLOG_EN 0x0dULL #define CONTROL_PPFINT_EN 0x0eULL +#define CONTROL_PPR_EN 0x0fULL +#define CONTROL_GT_EN 0x10ULL + +#define CTRL_INV_TO_MASK (7 << CONTROL_INV_TIMEOUT) +#define CTRL_INV_TO_NONE 0 +#define CTRL_INV_TO_1MS 1 +#define CTRL_INV_TO_10MS 2 +#define CTRL_INV_TO_100MS 3 +#define CTRL_INV_TO_1S 4 +#define CTRL_INV_TO_10S 5 +#define CTRL_INV_TO_100S 6 /* command specific defines */ #define CMD_COMPL_WAIT 0x01 #define CMD_INV_DEV_ENTRY 0x02 #define CMD_INV_IOMMU_PAGES 0x03 #define CMD_INV_IOTLB_PAGES 0x04 +#define CMD_COMPLETE_PPR 0x07 #define CMD_INV_ALL 0x08 #define CMD_COMPL_WAIT_STORE_MASK 0x01 #define CMD_COMPL_WAIT_INT_MASK 0x02 #define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01 #define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 +#define CMD_INV_IOMMU_PAGES_GN_MASK 0x04 + +#define PPR_STATUS_MASK 0xf +#define PPR_STATUS_SHIFT 12 #define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL @@ -165,6 +194,23 @@ #define EVT_BUFFER_SIZE 8192 /* 512 entries */ #define EVT_LEN_MASK (0x9ULL << 56) +/* Constants for PPR Log handling */ +#define PPR_LOG_ENTRIES 512 +#define PPR_LOG_SIZE_SHIFT 56 +#define PPR_LOG_SIZE_512 (0x9ULL << PPR_LOG_SIZE_SHIFT) +#define PPR_ENTRY_SIZE 16 +#define PPR_LOG_SIZE (PPR_ENTRY_SIZE * PPR_LOG_ENTRIES) + +#define PPR_REQ_TYPE(x) (((x) >> 60) & 0xfULL) +#define PPR_FLAGS(x) (((x) >> 48) & 0xfffULL) +#define PPR_DEVID(x) ((x) & 0xffffULL) +#define PPR_TAG(x) (((x) >> 32) & 0x3ffULL) +#define PPR_PASID1(x) (((x) >> 16) & 0xffffULL) +#define PPR_PASID2(x) (((x) >> 42) & 0xfULL) +#define PPR_PASID(x) ((PPR_PASID2(x) << 16) | PPR_PASID1(x)) + +#define PPR_REQ_FAULT 0x01 + #define PAGE_MODE_NONE 0x00 #define PAGE_MODE_1_LEVEL 0x01 #define PAGE_MODE_2_LEVEL 0x02 @@ -230,7 +276,24 @@ #define IOMMU_PTE_IR (1ULL << 61) #define IOMMU_PTE_IW (1ULL << 62) -#define DTE_FLAG_IOTLB 0x01 +#define DTE_FLAG_IOTLB (0x01UL << 32) +#define DTE_FLAG_GV (0x01ULL << 55) +#define DTE_GLX_SHIFT (56) +#define DTE_GLX_MASK (3) + +#define DTE_GCR3_VAL_A(x) (((x) >> 12) & 0x00007ULL) +#define DTE_GCR3_VAL_B(x) (((x) >> 15) & 0x0ffffULL) +#define DTE_GCR3_VAL_C(x) (((x) >> 31) & 0xfffffULL) + +#define DTE_GCR3_INDEX_A 0 +#define DTE_GCR3_INDEX_B 1 +#define DTE_GCR3_INDEX_C 1 + +#define DTE_GCR3_SHIFT_A 58 +#define DTE_GCR3_SHIFT_B 16 +#define DTE_GCR3_SHIFT_C 43 + +#define GCR3_VALID 0x01ULL #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) @@ -257,6 +320,7 @@ domain for an IOMMU */ #define PD_PASSTHROUGH_MASK (1UL << 2) /* domain has no page translation */ +#define PD_IOMMUV2_MASK (1UL << 3) /* domain has gcr3 table */ extern bool amd_iommu_dump; #define DUMP_printk(format, arg...) \ @@ -285,6 +349,29 @@ extern bool amd_iommu_iotlb_sup; #define APERTURE_RANGE_INDEX(a) ((a) >> APERTURE_RANGE_SHIFT) #define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL) + +/* + * This struct is used to pass information about + * incoming PPR faults around. + */ +struct amd_iommu_fault { + u64 address; /* IO virtual address of the fault*/ + u32 pasid; /* Address space identifier */ + u16 device_id; /* Originating PCI device id */ + u16 tag; /* PPR tag */ + u16 flags; /* Fault flags */ + +}; + +#define PPR_FAULT_EXEC (1 << 1) +#define PPR_FAULT_READ (1 << 2) +#define PPR_FAULT_WRITE (1 << 5) +#define PPR_FAULT_USER (1 << 6) +#define PPR_FAULT_RSVD (1 << 7) +#define PPR_FAULT_GN (1 << 8) + +struct iommu_domain; + /* * This structure contains generic data for IOMMU protection domains * independent of their use. @@ -297,11 +384,15 @@ struct protection_domain { u16 id; /* the domain id written to the device table */ int mode; /* paging mode (0-6 levels) */ u64 *pt_root; /* page table root pointer */ + int glx; /* Number of levels for GCR3 table */ + u64 *gcr3_tbl; /* Guest CR3 table */ unsigned long flags; /* flags to find out type of domain */ bool updated; /* complete domain flush required */ unsigned dev_cnt; /* devices assigned to this domain */ unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */ void *priv; /* private data */ + struct iommu_domain *iommu_domain; /* Pointer to generic + domain structure */ }; @@ -315,10 +406,15 @@ struct iommu_dev_data { struct protection_domain *domain; /* Domain the device is bound to */ atomic_t bind; /* Domain attach reverent count */ u16 devid; /* PCI Device ID */ + bool iommu_v2; /* Device can make use of IOMMUv2 */ + bool passthrough; /* Default for device is pt_domain */ struct { bool enabled; int qdep; } ats; /* ATS state */ + bool pri_tlp; /* PASID TLB required for + PPR completions */ + u32 errata; /* Bitmap for errata to apply */ }; /* @@ -399,6 +495,9 @@ struct amd_iommu { /* Extended features */ u64 features; + /* IOMMUv2 */ + bool is_iommu_v2; + /* * Capability pointer. There could be more than one IOMMU per PCI * device function if there are more than one AMD IOMMU capability @@ -431,6 +530,9 @@ struct amd_iommu { /* MSI number for event interrupt */ u16 evt_msi_num; + /* Base of the PPR log, if present */ + u8 *ppr_log; + /* true if interrupts for this IOMMU are already enabled */ bool int_enabled; @@ -484,7 +586,7 @@ extern struct list_head amd_iommu_pd_list; * Structure defining one entry in the device table */ struct dev_table_entry { - u32 data[8]; + u64 data[4]; }; /* @@ -549,6 +651,16 @@ extern unsigned long *amd_iommu_pd_alloc_bitmap; */ extern bool amd_iommu_unmap_flush; +/* Smallest number of PASIDs supported by any IOMMU in the system */ +extern u32 amd_iommu_max_pasids; + +extern bool amd_iommu_v2_present; + +extern bool amd_iommu_force_isolation; + +/* Max levels of glxval supported */ +extern int amd_iommu_max_glx_val; + /* takes bus and device/function and returns the device id * FIXME: should that be in generic PCI code? */ static inline u16 calc_devid(u8 bus, u8 devfn) diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c new file mode 100644 index 000000000000..8add9f125d3e --- /dev/null +++ b/drivers/iommu/amd_iommu_v2.c @@ -0,0 +1,994 @@ +/* + * Copyright (C) 2010-2012 Advanced Micro Devices, Inc. + * Author: Joerg Roedel <joerg.roedel@amd.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/mmu_notifier.h> +#include <linux/amd-iommu.h> +#include <linux/mm_types.h> +#include <linux/profile.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/iommu.h> +#include <linux/wait.h> +#include <linux/pci.h> +#include <linux/gfp.h> + +#include "amd_iommu_types.h" +#include "amd_iommu_proto.h" + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Joerg Roedel <joerg.roedel@amd.com>"); + +#define MAX_DEVICES 0x10000 +#define PRI_QUEUE_SIZE 512 + +struct pri_queue { + atomic_t inflight; + bool finish; + int status; +}; + +struct pasid_state { + struct list_head list; /* For global state-list */ + atomic_t count; /* Reference count */ + struct task_struct *task; /* Task bound to this PASID */ + struct mm_struct *mm; /* mm_struct for the faults */ + struct mmu_notifier mn; /* mmu_otifier handle */ + struct pri_queue pri[PRI_QUEUE_SIZE]; /* PRI tag states */ + struct device_state *device_state; /* Link to our device_state */ + int pasid; /* PASID index */ + spinlock_t lock; /* Protect pri_queues */ + wait_queue_head_t wq; /* To wait for count == 0 */ +}; + +struct device_state { + atomic_t count; + struct pci_dev *pdev; + struct pasid_state **states; + struct iommu_domain *domain; + int pasid_levels; + int max_pasids; + amd_iommu_invalid_ppr_cb inv_ppr_cb; + amd_iommu_invalidate_ctx inv_ctx_cb; + spinlock_t lock; + wait_queue_head_t wq; +}; + +struct fault { + struct work_struct work; + struct device_state *dev_state; + struct pasid_state *state; + struct mm_struct *mm; + u64 address; + u16 devid; + u16 pasid; + u16 tag; + u16 finish; + u16 flags; +}; + +struct device_state **state_table; +static spinlock_t state_lock; + +/* List and lock for all pasid_states */ +static LIST_HEAD(pasid_state_list); +static DEFINE_SPINLOCK(ps_lock); + +static struct workqueue_struct *iommu_wq; + +/* + * Empty page table - Used between + * mmu_notifier_invalidate_range_start and + * mmu_notifier_invalidate_range_end + */ +static u64 *empty_page_table; + +static void free_pasid_states(struct device_state *dev_state); +static void unbind_pasid(struct device_state *dev_state, int pasid); +static int task_exit(struct notifier_block *nb, unsigned long e, void *data); + +static u16 device_id(struct pci_dev *pdev) +{ + u16 devid; + + devid = pdev->bus->number; + devid = (devid << 8) | pdev->devfn; + + return devid; +} + +static struct device_state *get_device_state(u16 devid) +{ + struct device_state *dev_state; + unsigned long flags; + + spin_lock_irqsave(&state_lock, flags); + dev_state = state_table[devid]; + if (dev_state != NULL) + atomic_inc(&dev_state->count); + spin_unlock_irqrestore(&state_lock, flags); + + return dev_state; +} + +static void free_device_state(struct device_state *dev_state) +{ + /* + * First detach device from domain - No more PRI requests will arrive + * from that device after it is unbound from the IOMMUv2 domain. + */ + iommu_detach_device(dev_state->domain, &dev_state->pdev->dev); + + /* Everything is down now, free the IOMMUv2 domain */ + iommu_domain_free(dev_state->domain); + + /* Finally get rid of the device-state */ + kfree(dev_state); +} + +static void put_device_state(struct device_state *dev_state) +{ + if (atomic_dec_and_test(&dev_state->count)) + wake_up(&dev_state->wq); +} + +static void put_device_state_wait(struct device_state *dev_state) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&dev_state->wq, &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_dec_and_test(&dev_state->count)) + schedule(); + finish_wait(&dev_state->wq, &wait); + + free_device_state(dev_state); +} + +static struct notifier_block profile_nb = { + .notifier_call = task_exit, +}; + +static void link_pasid_state(struct pasid_state *pasid_state) +{ + spin_lock(&ps_lock); + list_add_tail(&pasid_state->list, &pasid_state_list); + spin_unlock(&ps_lock); +} + +static void __unlink_pasid_state(struct pasid_state *pasid_state) +{ + list_del(&pasid_state->list); +} + +static void unlink_pasid_state(struct pasid_state *pasid_state) +{ + spin_lock(&ps_lock); + __unlink_pasid_state(pasid_state); + spin_unlock(&ps_lock); +} + +/* Must be called under dev_state->lock */ +static struct pasid_state **__get_pasid_state_ptr(struct device_state *dev_state, + int pasid, bool alloc) +{ + struct pasid_state **root, **ptr; + int level, index; + + level = dev_state->pasid_levels; + root = dev_state->states; + + while (true) { + + index = (pasid >> (9 * level)) & 0x1ff; + ptr = &root[index]; + + if (level == 0) + break; + + if (*ptr == NULL) { + if (!alloc) + return NULL; + + *ptr = (void *)get_zeroed_page(GFP_ATOMIC); + if (*ptr == NULL) + return NULL; + } + + root = (struct pasid_state **)*ptr; + level -= 1; + } + + return ptr; +} + +static int set_pasid_state(struct device_state *dev_state, + struct pasid_state *pasid_state, + int pasid) +{ + struct pasid_state **ptr; + unsigned long flags; + int ret; + + spin_lock_irqsave(&dev_state->lock, flags); + ptr = __get_pasid_state_ptr(dev_state, pasid, true); + + ret = -ENOMEM; + if (ptr == NULL) + goto out_unlock; + + ret = -ENOMEM; + if (*ptr != NULL) + goto out_unlock; + + *ptr = pasid_state; + + ret = 0; + +out_unlock: + spin_unlock_irqrestore(&dev_state->lock, flags); + + return ret; +} + +static void clear_pasid_state(struct device_state *dev_state, int pasid) +{ + struct pasid_state **ptr; + unsigned long flags; + + spin_lock_irqsave(&dev_state->lock, flags); + ptr = __get_pasid_state_ptr(dev_state, pasid, true); + + if (ptr == NULL) + goto out_unlock; + + *ptr = NULL; + +out_unlock: + spin_unlock_irqrestore(&dev_state->lock, flags); +} + +static struct pasid_state *get_pasid_state(struct device_state *dev_state, + int pasid) +{ + struct pasid_state **ptr, *ret = NULL; + unsigned long flags; + + spin_lock_irqsave(&dev_state->lock, flags); + ptr = __get_pasid_state_ptr(dev_state, pasid, false); + + if (ptr == NULL) + goto out_unlock; + + ret = *ptr; + if (ret) + atomic_inc(&ret->count); + +out_unlock: + spin_unlock_irqrestore(&dev_state->lock, flags); + + return ret; +} + +static void free_pasid_state(struct pasid_state *pasid_state) +{ + kfree(pasid_state); +} + +static void put_pasid_state(struct pasid_state *pasid_state) +{ + if (atomic_dec_and_test(&pasid_state->count)) { + put_device_state(pasid_state->device_state); + wake_up(&pasid_state->wq); + } +} + +static void put_pasid_state_wait(struct pasid_state *pasid_state) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&pasid_state->wq, &wait, TASK_UNINTERRUPTIBLE); + + if (atomic_dec_and_test(&pasid_state->count)) + put_device_state(pasid_state->device_state); + else + schedule(); + + finish_wait(&pasid_state->wq, &wait); + mmput(pasid_state->mm); + free_pasid_state(pasid_state); +} + +static void __unbind_pasid(struct pasid_state *pasid_state) +{ + struct iommu_domain *domain; + + domain = pasid_state->device_state->domain; + + amd_iommu_domain_clear_gcr3(domain, pasid_state->pasid); + clear_pasid_state(pasid_state->device_state, pasid_state->pasid); + + /* Make sure no more pending faults are in the queue */ + flush_workqueue(iommu_wq); + + mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); + + put_pasid_state(pasid_state); /* Reference taken in bind() function */ +} + +static void unbind_pasid(struct device_state *dev_state, int pasid) +{ + struct pasid_state *pasid_state; + + pasid_state = get_pasid_state(dev_state, pasid); + if (pasid_state == NULL) + return; + + unlink_pasid_state(pasid_state); + __unbind_pasid(pasid_state); + put_pasid_state_wait(pasid_state); /* Reference taken in this function */ +} + +static void free_pasid_states_level1(struct pasid_state **tbl) +{ + int i; + + for (i = 0; i < 512; ++i) { + if (tbl[i] == NULL) + continue; + + free_page((unsigned long)tbl[i]); + } +} + +static void free_pasid_states_level2(struct pasid_state **tbl) +{ + struct pasid_state **ptr; + int i; + + for (i = 0; i < 512; ++i) { + if (tbl[i] == NULL) + continue; + + ptr = (struct pasid_state **)tbl[i]; + free_pasid_states_level1(ptr); + } +} + +static void free_pasid_states(struct device_state *dev_state) +{ + struct pasid_state *pasid_state; + int i; + + for (i = 0; i < dev_state->max_pasids; ++i) { + pasid_state = get_pasid_state(dev_state, i); + if (pasid_state == NULL) + continue; + + put_pasid_state(pasid_state); + unbind_pasid(dev_state, i); + } + + if (dev_state->pasid_levels == 2) + free_pasid_states_level2(dev_state->states); + else if (dev_state->pasid_levels == 1) + free_pasid_states_level1(dev_state->states); + else if (dev_state->pasid_levels != 0) + BUG(); + + free_page((unsigned long)dev_state->states); +} + +static struct pasid_state *mn_to_state(struct mmu_notifier *mn) +{ + return container_of(mn, struct pasid_state, mn); +} + +static void __mn_flush_page(struct mmu_notifier *mn, + unsigned long address) +{ + struct pasid_state *pasid_state; + struct device_state *dev_state; + + pasid_state = mn_to_state(mn); + dev_state = pasid_state->device_state; + + amd_iommu_flush_page(dev_state->domain, pasid_state->pasid, address); +} + +static int mn_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + __mn_flush_page(mn, address); + + return 0; +} + +static void mn_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + __mn_flush_page(mn, address); +} + +static void mn_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + __mn_flush_page(mn, address); +} + +static void mn_invalidate_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct pasid_state *pasid_state; + struct device_state *dev_state; + + pasid_state = mn_to_state(mn); + dev_state = pasid_state->device_state; + + amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid, + __pa(empty_page_table)); +} + +static void mn_invalidate_range_end(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct pasid_state *pasid_state; + struct device_state *dev_state; + + pasid_state = mn_to_state(mn); + dev_state = pasid_state->device_state; + + amd_iommu_domain_set_gcr3(dev_state->domain, pasid_state->pasid, + __pa(pasid_state->mm->pgd)); +} + +static struct mmu_notifier_ops iommu_mn = { + .clear_flush_young = mn_clear_flush_young, + .change_pte = mn_change_pte, + .invalidate_page = mn_invalidate_page, + .invalidate_range_start = mn_invalidate_range_start, + .invalidate_range_end = mn_invalidate_range_end, +}; + +static void set_pri_tag_status(struct pasid_state *pasid_state, + u16 tag, int status) +{ + unsigned long flags; + + spin_lock_irqsave(&pasid_state->lock, flags); + pasid_state->pri[tag].status = status; + spin_unlock_irqrestore(&pasid_state->lock, flags); +} + +static void finish_pri_tag(struct device_state *dev_state, + struct pasid_state *pasid_state, + u16 tag) +{ + unsigned long flags; + + spin_lock_irqsave(&pasid_state->lock, flags); + if (atomic_dec_and_test(&pasid_state->pri[tag].inflight) && + pasid_state->pri[tag].finish) { + amd_iommu_complete_ppr(dev_state->pdev, pasid_state->pasid, + pasid_state->pri[tag].status, tag); + pasid_state->pri[tag].finish = false; + pasid_state->pri[tag].status = PPR_SUCCESS; + } + spin_unlock_irqrestore(&pasid_state->lock, flags); +} + +static void do_fault(struct work_struct *work) +{ + struct fault *fault = container_of(work, struct fault, work); + int npages, write; + struct page *page; + + write = !!(fault->flags & PPR_FAULT_WRITE); + + npages = get_user_pages(fault->state->task, fault->state->mm, + fault->address, 1, write, 0, &page, NULL); + + if (npages == 1) { + put_page(page); + } else if (fault->dev_state->inv_ppr_cb) { + int status; + + status = fault->dev_state->inv_ppr_cb(fault->dev_state->pdev, + fault->pasid, + fault->address, + fault->flags); + switch (status) { + case AMD_IOMMU_INV_PRI_RSP_SUCCESS: + set_pri_tag_status(fault->state, fault->tag, PPR_SUCCESS); + break; + case AMD_IOMMU_INV_PRI_RSP_INVALID: + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + break; + case AMD_IOMMU_INV_PRI_RSP_FAIL: + set_pri_tag_status(fault->state, fault->tag, PPR_FAILURE); + break; + default: + BUG(); + } + } else { + set_pri_tag_status(fault->state, fault->tag, PPR_INVALID); + } + + finish_pri_tag(fault->dev_state, fault->state, fault->tag); + + put_pasid_state(fault->state); + + kfree(fault); +} + +static int ppr_notifier(struct notifier_block *nb, unsigned long e, void *data) +{ + struct amd_iommu_fault *iommu_fault; + struct pasid_state *pasid_state; + struct device_state *dev_state; + unsigned long flags; + struct fault *fault; + bool finish; + u16 tag; + int ret; + + iommu_fault = data; + tag = iommu_fault->tag & 0x1ff; + finish = (iommu_fault->tag >> 9) & 1; + + ret = NOTIFY_DONE; + dev_state = get_device_state(iommu_fault->device_id); + if (dev_state == NULL) + goto out; + + pasid_state = get_pasid_state(dev_state, iommu_fault->pasid); + if (pasid_state == NULL) { + /* We know the device but not the PASID -> send INVALID */ + amd_iommu_complete_ppr(dev_state->pdev, iommu_fault->pasid, + PPR_INVALID, tag); + goto out_drop_state; + } + + spin_lock_irqsave(&pasid_state->lock, flags); + atomic_inc(&pasid_state->pri[tag].inflight); + if (finish) + pasid_state->pri[tag].finish = true; + spin_unlock_irqrestore(&pasid_state->lock, flags); + + fault = kzalloc(sizeof(*fault), GFP_ATOMIC); + if (fault == NULL) { + /* We are OOM - send success and let the device re-fault */ + finish_pri_tag(dev_state, pasid_state, tag); + goto out_drop_state; + } + + fault->dev_state = dev_state; + fault->address = iommu_fault->address; + fault->state = pasid_state; + fault->tag = tag; + fault->finish = finish; + fault->flags = iommu_fault->flags; + INIT_WORK(&fault->work, do_fault); + + queue_work(iommu_wq, &fault->work); + + ret = NOTIFY_OK; + +out_drop_state: + put_device_state(dev_state); + +out: + return ret; +} + +static struct notifier_block ppr_nb = { + .notifier_call = ppr_notifier, +}; + +static int task_exit(struct notifier_block *nb, unsigned long e, void *data) +{ + struct pasid_state *pasid_state; + struct task_struct *task; + + task = data; + + /* + * Using this notifier is a hack - but there is no other choice + * at the moment. What I really want is a sleeping notifier that + * is called when an MM goes down. But such a notifier doesn't + * exist yet. The notifier needs to sleep because it has to make + * sure that the device does not use the PASID and the address + * space anymore before it is destroyed. This includes waiting + * for pending PRI requests to pass the workqueue. The + * MMU-Notifiers would be a good fit, but they use RCU and so + * they are not allowed to sleep. Lets see how we can solve this + * in a more intelligent way in the future. + */ +again: + spin_lock(&ps_lock); + list_for_each_entry(pasid_state, &pasid_state_list, list) { + struct device_state *dev_state; + int pasid; + + if (pasid_state->task != task) + continue; + + /* Drop Lock and unbind */ + spin_unlock(&ps_lock); + + dev_state = pasid_state->device_state; + pasid = pasid_state->pasid; + + if (pasid_state->device_state->inv_ctx_cb) + dev_state->inv_ctx_cb(dev_state->pdev, pasid); + + unbind_pasid(dev_state, pasid); + + /* Task may be in the list multiple times */ + goto again; + } + spin_unlock(&ps_lock); + + return NOTIFY_OK; +} + +int amd_iommu_bind_pasid(struct pci_dev *pdev, int pasid, + struct task_struct *task) +{ + struct pasid_state *pasid_state; + struct device_state *dev_state; + u16 devid; + int ret; + + might_sleep(); + + if (!amd_iommu_v2_supported()) + return -ENODEV; + + devid = device_id(pdev); + dev_state = get_device_state(devid); + + if (dev_state == NULL) + return -EINVAL; + + ret = -EINVAL; + if (pasid < 0 || pasid >= dev_state->max_pasids) + goto out; + + ret = -ENOMEM; + pasid_state = kzalloc(sizeof(*pasid_state), GFP_KERNEL); + if (pasid_state == NULL) + goto out; + + atomic_set(&pasid_state->count, 1); + init_waitqueue_head(&pasid_state->wq); + pasid_state->task = task; + pasid_state->mm = get_task_mm(task); + pasid_state->device_state = dev_state; + pasid_state->pasid = pasid; + pasid_state->mn.ops = &iommu_mn; + + if (pasid_state->mm == NULL) + goto out_free; + + mmu_notifier_register(&pasid_state->mn, pasid_state->mm); + + ret = set_pasid_state(dev_state, pasid_state, pasid); + if (ret) + goto out_unregister; + + ret = amd_iommu_domain_set_gcr3(dev_state->domain, pasid, + __pa(pasid_state->mm->pgd)); + if (ret) + goto out_clear_state; + + link_pasid_state(pasid_state); + + return 0; + +out_clear_state: + clear_pasid_state(dev_state, pasid); + +out_unregister: + mmu_notifier_unregister(&pasid_state->mn, pasid_state->mm); + +out_free: + free_pasid_state(pasid_state); + +out: + put_device_state(dev_state); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_bind_pasid); + +void amd_iommu_unbind_pasid(struct pci_dev *pdev, int pasid) +{ + struct device_state *dev_state; + u16 devid; + + might_sleep(); + + if (!amd_iommu_v2_supported()) + return; + + devid = device_id(pdev); + dev_state = get_device_state(devid); + if (dev_state == NULL) + return; + + if (pasid < 0 || pasid >= dev_state->max_pasids) + goto out; + + unbind_pasid(dev_state, pasid); + +out: + put_device_state(dev_state); +} +EXPORT_SYMBOL(amd_iommu_unbind_pasid); + +int amd_iommu_init_device(struct pci_dev *pdev, int pasids) +{ + struct device_state *dev_state; + unsigned long flags; + int ret, tmp; + u16 devid; + + might_sleep(); + + if (!amd_iommu_v2_supported()) + return -ENODEV; + + if (pasids <= 0 || pasids > (PASID_MASK + 1)) + return -EINVAL; + + devid = device_id(pdev); + + dev_state = kzalloc(sizeof(*dev_state), GFP_KERNEL); + if (dev_state == NULL) + return -ENOMEM; + + spin_lock_init(&dev_state->lock); + init_waitqueue_head(&dev_state->wq); + dev_state->pdev = pdev; + + tmp = pasids; + for (dev_state->pasid_levels = 0; (tmp - 1) & ~0x1ff; tmp >>= 9) + dev_state->pasid_levels += 1; + + atomic_set(&dev_state->count, 1); + dev_state->max_pasids = pasids; + + ret = -ENOMEM; + dev_state->states = (void *)get_zeroed_page(GFP_KERNEL); + if (dev_state->states == NULL) + goto out_free_dev_state; + + dev_state->domain = iommu_domain_alloc(&pci_bus_type); + if (dev_state->domain == NULL) + goto out_free_states; + + amd_iommu_domain_direct_map(dev_state->domain); + + ret = amd_iommu_domain_enable_v2(dev_state->domain, pasids); + if (ret) + goto out_free_domain; + + ret = iommu_attach_device(dev_state->domain, &pdev->dev); + if (ret != 0) + goto out_free_domain; + + spin_lock_irqsave(&state_lock, flags); + + if (state_table[devid] != NULL) { + spin_unlock_irqrestore(&state_lock, flags); + ret = -EBUSY; + goto out_free_domain; + } + + state_table[devid] = dev_state; + + spin_unlock_irqrestore(&state_lock, flags); + + return 0; + +out_free_domain: + iommu_domain_free(dev_state->domain); + +out_free_states: + free_page((unsigned long)dev_state->states); + +out_free_dev_state: + kfree(dev_state); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_init_device); + +void amd_iommu_free_device(struct pci_dev *pdev) +{ + struct device_state *dev_state; + unsigned long flags; + u16 devid; + + if (!amd_iommu_v2_supported()) + return; + + devid = device_id(pdev); + + spin_lock_irqsave(&state_lock, flags); + + dev_state = state_table[devid]; + if (dev_state == NULL) { + spin_unlock_irqrestore(&state_lock, flags); + return; + } + + state_table[devid] = NULL; + + spin_unlock_irqrestore(&state_lock, flags); + + /* Get rid of any remaining pasid states */ + free_pasid_states(dev_state); + + put_device_state_wait(dev_state); +} +EXPORT_SYMBOL(amd_iommu_free_device); + +int amd_iommu_set_invalid_ppr_cb(struct pci_dev *pdev, + amd_iommu_invalid_ppr_cb cb) +{ + struct device_state *dev_state; + unsigned long flags; + u16 devid; + int ret; + + if (!amd_iommu_v2_supported()) + return -ENODEV; + + devid = device_id(pdev); + + spin_lock_irqsave(&state_lock, flags); + + ret = -EINVAL; + dev_state = state_table[devid]; + if (dev_state == NULL) + goto out_unlock; + + dev_state->inv_ppr_cb = cb; + + ret = 0; + +out_unlock: + spin_unlock_irqrestore(&state_lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_set_invalid_ppr_cb); + +int amd_iommu_set_invalidate_ctx_cb(struct pci_dev *pdev, + amd_iommu_invalidate_ctx cb) +{ + struct device_state *dev_state; + unsigned long flags; + u16 devid; + int ret; + + if (!amd_iommu_v2_supported()) + return -ENODEV; + + devid = device_id(pdev); + + spin_lock_irqsave(&state_lock, flags); + + ret = -EINVAL; + dev_state = state_table[devid]; + if (dev_state == NULL) + goto out_unlock; + + dev_state->inv_ctx_cb = cb; + + ret = 0; + +out_unlock: + spin_unlock_irqrestore(&state_lock, flags); + + return ret; +} +EXPORT_SYMBOL(amd_iommu_set_invalidate_ctx_cb); + +static int __init amd_iommu_v2_init(void) +{ + size_t state_table_size; + int ret; + + pr_info("AMD IOMMUv2 driver by Joerg Roedel <joerg.roedel@amd.com>"); + + spin_lock_init(&state_lock); + + state_table_size = MAX_DEVICES * sizeof(struct device_state *); + state_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(state_table_size)); + if (state_table == NULL) + return -ENOMEM; + + ret = -ENOMEM; + iommu_wq = create_workqueue("amd_iommu_v2"); + if (iommu_wq == NULL) + goto out_free; + + ret = -ENOMEM; + empty_page_table = (u64 *)get_zeroed_page(GFP_KERNEL); + if (empty_page_table == NULL) + goto out_destroy_wq; + + amd_iommu_register_ppr_notifier(&ppr_nb); + profile_event_register(PROFILE_TASK_EXIT, &profile_nb); + + return 0; + +out_destroy_wq: + destroy_workqueue(iommu_wq); + +out_free: + free_pages((unsigned long)state_table, get_order(state_table_size)); + + return ret; +} + +static void __exit amd_iommu_v2_exit(void) +{ + struct device_state *dev_state; + size_t state_table_size; + int i; + + profile_event_unregister(PROFILE_TASK_EXIT, &profile_nb); + amd_iommu_unregister_ppr_notifier(&ppr_nb); + + flush_workqueue(iommu_wq); + + /* + * The loop below might call flush_workqueue(), so call + * destroy_workqueue() after it + */ + for (i = 0; i < MAX_DEVICES; ++i) { + dev_state = get_device_state(i); + + if (dev_state == NULL) + continue; + + WARN_ON_ONCE(1); + + put_device_state(dev_state); + amd_iommu_free_device(dev_state->pdev); + } + + destroy_workqueue(iommu_wq); + + state_table_size = MAX_DEVICES * sizeof(struct device_state *); + free_pages((unsigned long)state_table, get_order(state_table_size)); + + free_page((unsigned long)empty_page_table); +} + +module_init(amd_iommu_v2_init); +module_exit(amd_iommu_v2_exit); diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 31053a951c34..c9c6053198d4 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -79,6 +79,24 @@ #define LEVEL_STRIDE (9) #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1) +/* + * This bitmap is used to advertise the page sizes our hardware support + * to the IOMMU core, which will then use this information to split + * physically contiguous memory regions it is mapping into page sizes + * that we support. + * + * Traditionally the IOMMU core just handed us the mappings directly, + * after making sure the size is an order of a 4KiB page and that the + * mapping has natural alignment. + * + * To retain this behavior, we currently advertise that we support + * all page sizes that are an order of 4KiB. + * + * If at some point we'd like to utilize the IOMMU core's new behavior, + * we could change this to advertise the real page sizes we support. + */ +#define INTEL_IOMMU_PGSIZES (~0xFFFUL) + static inline int agaw_to_level(int agaw) { return agaw + 2; @@ -3979,12 +3997,11 @@ static void intel_iommu_detach_device(struct iommu_domain *domain, static int intel_iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t hpa, - int gfp_order, int iommu_prot) + size_t size, int iommu_prot) { struct dmar_domain *dmar_domain = domain->priv; u64 max_addr; int prot = 0; - size_t size; int ret; if (iommu_prot & IOMMU_READ) @@ -3994,7 +4011,6 @@ static int intel_iommu_map(struct iommu_domain *domain, if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping) prot |= DMA_PTE_SNP; - size = PAGE_SIZE << gfp_order; max_addr = iova + size; if (dmar_domain->max_addr < max_addr) { u64 end; @@ -4017,11 +4033,10 @@ static int intel_iommu_map(struct iommu_domain *domain, return ret; } -static int intel_iommu_unmap(struct iommu_domain *domain, - unsigned long iova, int gfp_order) +static size_t intel_iommu_unmap(struct iommu_domain *domain, + unsigned long iova, size_t size) { struct dmar_domain *dmar_domain = domain->priv; - size_t size = PAGE_SIZE << gfp_order; int order; order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT, @@ -4030,7 +4045,7 @@ static int intel_iommu_unmap(struct iommu_domain *domain, if (dmar_domain->max_addr == iova + size) dmar_domain->max_addr = iova; - return order; + return PAGE_SIZE << order; } static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, @@ -4060,6 +4075,54 @@ static int intel_iommu_domain_has_cap(struct iommu_domain *domain, return 0; } +/* + * Group numbers are arbitrary. Device with the same group number + * indicate the iommu cannot differentiate between them. To avoid + * tracking used groups we just use the seg|bus|devfn of the lowest + * level we're able to differentiate devices + */ +static int intel_iommu_device_group(struct device *dev, unsigned int *groupid) +{ + struct pci_dev *pdev = to_pci_dev(dev); + struct pci_dev *bridge; + union { + struct { + u8 devfn; + u8 bus; + u16 segment; + } pci; + u32 group; + } id; + + if (iommu_no_mapping(dev)) + return -ENODEV; + + id.pci.segment = pci_domain_nr(pdev->bus); + id.pci.bus = pdev->bus->number; + id.pci.devfn = pdev->devfn; + + if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn)) + return -ENODEV; + + bridge = pci_find_upstream_pcie_bridge(pdev); + if (bridge) { + if (pci_is_pcie(bridge)) { + id.pci.bus = bridge->subordinate->number; + id.pci.devfn = 0; + } else { + id.pci.bus = bridge->bus->number; + id.pci.devfn = bridge->devfn; + } + } + + if (!pdev->is_virtfn && iommu_group_mf) + id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0); + + *groupid = id.group; + + return 0; +} + static struct iommu_ops intel_iommu_ops = { .domain_init = intel_iommu_domain_init, .domain_destroy = intel_iommu_domain_destroy, @@ -4069,6 +4132,8 @@ static struct iommu_ops intel_iommu_ops = { .unmap = intel_iommu_unmap, .iova_to_phys = intel_iommu_iova_to_phys, .domain_has_cap = intel_iommu_domain_has_cap, + .device_group = intel_iommu_device_group, + .pgsize_bitmap = INTEL_IOMMU_PGSIZES, }; static void __devinit quirk_iommu_rwbf(struct pci_dev *dev) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5b5fa5cdaa31..2198b2dbbcd3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -16,6 +16,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#define pr_fmt(fmt) "%s: " fmt, __func__ + #include <linux/device.h> #include <linux/kernel.h> #include <linux/bug.h> @@ -25,8 +27,59 @@ #include <linux/errno.h> #include <linux/iommu.h> +static ssize_t show_iommu_group(struct device *dev, + struct device_attribute *attr, char *buf) +{ + unsigned int groupid; + + if (iommu_device_group(dev, &groupid)) + return 0; + + return sprintf(buf, "%u", groupid); +} +static DEVICE_ATTR(iommu_group, S_IRUGO, show_iommu_group, NULL); + +static int add_iommu_group(struct device *dev, void *data) +{ + unsigned int groupid; + + if (iommu_device_group(dev, &groupid) == 0) + return device_create_file(dev, &dev_attr_iommu_group); + + return 0; +} + +static int remove_iommu_group(struct device *dev) +{ + unsigned int groupid; + + if (iommu_device_group(dev, &groupid) == 0) + device_remove_file(dev, &dev_attr_iommu_group); + + return 0; +} + +static int iommu_device_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + + if (action == BUS_NOTIFY_ADD_DEVICE) + return add_iommu_group(dev, NULL); + else if (action == BUS_NOTIFY_DEL_DEVICE) + return remove_iommu_group(dev); + + return 0; +} + +static struct notifier_block iommu_device_nb = { + .notifier_call = iommu_device_notifier, +}; + static void iommu_bus_init(struct bus_type *bus, struct iommu_ops *ops) { + bus_register_notifier(bus, &iommu_device_nb); + bus_for_each_dev(bus, NULL, NULL, add_iommu_group); } /** @@ -157,32 +210,134 @@ int iommu_domain_has_cap(struct iommu_domain *domain, EXPORT_SYMBOL_GPL(iommu_domain_has_cap); int iommu_map(struct iommu_domain *domain, unsigned long iova, - phys_addr_t paddr, int gfp_order, int prot) + phys_addr_t paddr, size_t size, int prot) { - size_t size; + unsigned long orig_iova = iova; + unsigned int min_pagesz; + size_t orig_size = size; + int ret = 0; if (unlikely(domain->ops->map == NULL)) return -ENODEV; - size = PAGE_SIZE << gfp_order; + /* find out the minimum page size supported */ + min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap); + + /* + * both the virtual address and the physical one, as well as + * the size of the mapping, must be aligned (at least) to the + * size of the smallest page supported by the hardware + */ + if (!IS_ALIGNED(iova | paddr | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx pa 0x%lx size 0x%lx min_pagesz " + "0x%x\n", iova, (unsigned long)paddr, + (unsigned long)size, min_pagesz); + return -EINVAL; + } + + pr_debug("map: iova 0x%lx pa 0x%lx size 0x%lx\n", iova, + (unsigned long)paddr, (unsigned long)size); + + while (size) { + unsigned long pgsize, addr_merge = iova | paddr; + unsigned int pgsize_idx; + + /* Max page size that still fits into 'size' */ + pgsize_idx = __fls(size); + + /* need to consider alignment requirements ? */ + if (likely(addr_merge)) { + /* Max page size allowed by both iova and paddr */ + unsigned int align_pgsize_idx = __ffs(addr_merge); + + pgsize_idx = min(pgsize_idx, align_pgsize_idx); + } + + /* build a mask of acceptable page sizes */ + pgsize = (1UL << (pgsize_idx + 1)) - 1; - BUG_ON(!IS_ALIGNED(iova | paddr, size)); + /* throw away page sizes not supported by the hardware */ + pgsize &= domain->ops->pgsize_bitmap; - return domain->ops->map(domain, iova, paddr, gfp_order, prot); + /* make sure we're still sane */ + BUG_ON(!pgsize); + + /* pick the biggest page */ + pgsize_idx = __fls(pgsize); + pgsize = 1UL << pgsize_idx; + + pr_debug("mapping: iova 0x%lx pa 0x%lx pgsize %lu\n", iova, + (unsigned long)paddr, pgsize); + + ret = domain->ops->map(domain, iova, paddr, pgsize, prot); + if (ret) + break; + + iova += pgsize; + paddr += pgsize; + size -= pgsize; + } + + /* unroll mapping in case something went wrong */ + if (ret) + iommu_unmap(domain, orig_iova, orig_size - size); + + return ret; } EXPORT_SYMBOL_GPL(iommu_map); -int iommu_unmap(struct iommu_domain *domain, unsigned long iova, int gfp_order) +size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size) { - size_t size; + size_t unmapped_page, unmapped = 0; + unsigned int min_pagesz; if (unlikely(domain->ops->unmap == NULL)) return -ENODEV; - size = PAGE_SIZE << gfp_order; + /* find out the minimum page size supported */ + min_pagesz = 1 << __ffs(domain->ops->pgsize_bitmap); + + /* + * The virtual address, as well as the size of the mapping, must be + * aligned (at least) to the size of the smallest page supported + * by the hardware + */ + if (!IS_ALIGNED(iova | size, min_pagesz)) { + pr_err("unaligned: iova 0x%lx size 0x%lx min_pagesz 0x%x\n", + iova, (unsigned long)size, min_pagesz); + return -EINVAL; + } + + pr_debug("unmap this: iova 0x%lx size 0x%lx\n", iova, + (unsigned long)size); + + /* + * Keep iterating until we either unmap 'size' bytes (or more) + * or we hit an area that isn't mapped. + */ + while (unmapped < size) { + size_t left = size - unmapped; + + unmapped_page = domain->ops->unmap(domain, iova, left); + if (!unmapped_page) + break; + + pr_debug("unmapped: iova 0x%lx size %lx\n", iova, + (unsigned long)unmapped_page); + + iova += unmapped_page; + unmapped += unmapped_page; + } + + return unmapped; +} +EXPORT_SYMBOL_GPL(iommu_unmap); - BUG_ON(!IS_ALIGNED(iova, size)); +int iommu_device_group(struct device *dev, unsigned int *groupid) +{ + if (iommu_present(dev->bus) && dev->bus->iommu_ops->device_group) + return dev->bus->iommu_ops->device_group(dev, groupid); - return domain->ops->unmap(domain, iova, gfp_order); + return -ENODEV; } -EXPORT_SYMBOL_GPL(iommu_unmap); +EXPORT_SYMBOL_GPL(iommu_device_group); diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 5865dd2e28f9..08a90b88e40d 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -42,6 +42,9 @@ __asm__ __volatile__ ( \ #define RCP15_PRRR(reg) MRC(reg, p15, 0, c10, c2, 0) #define RCP15_NMRR(reg) MRC(reg, p15, 0, c10, c2, 1) +/* bitmap of the page sizes currently supported */ +#define MSM_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M) + static int msm_iommu_tex_class[4]; DEFINE_SPINLOCK(msm_iommu_lock); @@ -352,7 +355,7 @@ fail: } static int msm_iommu_map(struct iommu_domain *domain, unsigned long va, - phys_addr_t pa, int order, int prot) + phys_addr_t pa, size_t len, int prot) { struct msm_priv *priv; unsigned long flags; @@ -363,7 +366,6 @@ static int msm_iommu_map(struct iommu_domain *domain, unsigned long va, unsigned long *sl_pte; unsigned long sl_offset; unsigned int pgprot; - size_t len = 0x1000UL << order; int ret = 0, tex, sh; spin_lock_irqsave(&msm_iommu_lock, flags); @@ -463,8 +465,8 @@ fail: return ret; } -static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, - int order) +static size_t msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, + size_t len) { struct msm_priv *priv; unsigned long flags; @@ -474,7 +476,6 @@ static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, unsigned long *sl_table; unsigned long *sl_pte; unsigned long sl_offset; - size_t len = 0x1000UL << order; int i, ret = 0; spin_lock_irqsave(&msm_iommu_lock, flags); @@ -544,15 +545,12 @@ static int msm_iommu_unmap(struct iommu_domain *domain, unsigned long va, ret = __flush_iotlb(domain); - /* - * the IOMMU API requires us to return the order of the unmapped - * page (on success). - */ - if (!ret) - ret = order; fail: spin_unlock_irqrestore(&msm_iommu_lock, flags); - return ret; + + /* the IOMMU API requires us to return how many bytes were unmapped */ + len = ret ? 0 : len; + return len; } static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain, @@ -684,7 +682,8 @@ static struct iommu_ops msm_iommu_ops = { .map = msm_iommu_map, .unmap = msm_iommu_unmap, .iova_to_phys = msm_iommu_iova_to_phys, - .domain_has_cap = msm_iommu_domain_has_cap + .domain_has_cap = msm_iommu_domain_has_cap, + .pgsize_bitmap = MSM_IOMMU_PGSIZES, }; static int __init get_tex_class(int icp, int ocp, int mt, int nos) diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 8f32b2bf7587..d8edd979d01b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -33,6 +33,9 @@ (__i < (n)) && (cr = __iotlb_read_cr((obj), __i), true); \ __i++) +/* bitmap of the page sizes currently supported */ +#define OMAP_IOMMU_PGSIZES (SZ_4K | SZ_64K | SZ_1M | SZ_16M) + /** * struct omap_iommu_domain - omap iommu domain * @pgtable: the page table @@ -86,20 +89,24 @@ EXPORT_SYMBOL_GPL(omap_uninstall_iommu_arch); /** * omap_iommu_save_ctx - Save registers for pm off-mode support - * @obj: target iommu + * @dev: client device **/ -void omap_iommu_save_ctx(struct omap_iommu *obj) +void omap_iommu_save_ctx(struct device *dev) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); + arch_iommu->save_ctx(obj); } EXPORT_SYMBOL_GPL(omap_iommu_save_ctx); /** * omap_iommu_restore_ctx - Restore registers for pm off-mode support - * @obj: target iommu + * @dev: client device **/ -void omap_iommu_restore_ctx(struct omap_iommu *obj) +void omap_iommu_restore_ctx(struct device *dev) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); + arch_iommu->restore_ctx(obj); } EXPORT_SYMBOL_GPL(omap_iommu_restore_ctx); @@ -820,35 +827,23 @@ static int device_match_by_alias(struct device *dev, void *data) } /** - * omap_find_iommu_device() - find an omap iommu device by name - * @name: name of the iommu device - * - * The generic iommu API requires the caller to provide the device - * he wishes to attach to a certain iommu domain. - * - * Drivers generally should not bother with this as it should just - * be taken care of by the DMA-API using dev_archdata. - * - * This function is provided as an interim solution until the latter - * materializes, and omap3isp is fully migrated to the DMA-API. - */ -struct device *omap_find_iommu_device(const char *name) -{ - return driver_find_device(&omap_iommu_driver.driver, NULL, - (void *)name, - device_match_by_alias); -} -EXPORT_SYMBOL_GPL(omap_find_iommu_device); - -/** * omap_iommu_attach() - attach iommu device to an iommu domain - * @dev: target omap iommu device + * @name: name of target omap iommu device * @iopgd: page table **/ -static struct omap_iommu *omap_iommu_attach(struct device *dev, u32 *iopgd) +static struct omap_iommu *omap_iommu_attach(const char *name, u32 *iopgd) { int err = -ENOMEM; - struct omap_iommu *obj = to_iommu(dev); + struct device *dev; + struct omap_iommu *obj; + + dev = driver_find_device(&omap_iommu_driver.driver, NULL, + (void *)name, + device_match_by_alias); + if (!dev) + return NULL; + + obj = to_iommu(dev); spin_lock(&obj->iommu_lock); @@ -1019,12 +1014,11 @@ static void iopte_cachep_ctor(void *iopte) } static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, - phys_addr_t pa, int order, int prot) + phys_addr_t pa, size_t bytes, int prot) { struct omap_iommu_domain *omap_domain = domain->priv; struct omap_iommu *oiommu = omap_domain->iommu_dev; struct device *dev = oiommu->dev; - size_t bytes = PAGE_SIZE << order; struct iotlb_entry e; int omap_pgsz; u32 ret, flags; @@ -1049,19 +1043,16 @@ static int omap_iommu_map(struct iommu_domain *domain, unsigned long da, return ret; } -static int omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, - int order) +static size_t omap_iommu_unmap(struct iommu_domain *domain, unsigned long da, + size_t size) { struct omap_iommu_domain *omap_domain = domain->priv; struct omap_iommu *oiommu = omap_domain->iommu_dev; struct device *dev = oiommu->dev; - size_t unmap_size; - - dev_dbg(dev, "unmapping da 0x%lx order %d\n", da, order); - unmap_size = iopgtable_clear_entry(oiommu, da); + dev_dbg(dev, "unmapping da 0x%lx size %u\n", da, size); - return unmap_size ? get_order(unmap_size) : -EINVAL; + return iopgtable_clear_entry(oiommu, da); } static int @@ -1069,6 +1060,7 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) { struct omap_iommu_domain *omap_domain = domain->priv; struct omap_iommu *oiommu; + struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; int ret = 0; spin_lock(&omap_domain->lock); @@ -1081,14 +1073,14 @@ omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) } /* get a handle to and enable the omap iommu */ - oiommu = omap_iommu_attach(dev, omap_domain->pgtable); + oiommu = omap_iommu_attach(arch_data->name, omap_domain->pgtable); if (IS_ERR(oiommu)) { ret = PTR_ERR(oiommu); dev_err(dev, "can't get omap iommu: %d\n", ret); goto out; } - omap_domain->iommu_dev = oiommu; + omap_domain->iommu_dev = arch_data->iommu_dev = oiommu; oiommu->domain = domain; out: @@ -1100,7 +1092,8 @@ static void omap_iommu_detach_dev(struct iommu_domain *domain, struct device *dev) { struct omap_iommu_domain *omap_domain = domain->priv; - struct omap_iommu *oiommu = to_iommu(dev); + struct omap_iommu_arch_data *arch_data = dev->archdata.iommu; + struct omap_iommu *oiommu = dev_to_omap_iommu(dev); spin_lock(&omap_domain->lock); @@ -1114,7 +1107,7 @@ static void omap_iommu_detach_dev(struct iommu_domain *domain, omap_iommu_detach(oiommu); - omap_domain->iommu_dev = NULL; + omap_domain->iommu_dev = arch_data->iommu_dev = NULL; out: spin_unlock(&omap_domain->lock); @@ -1183,14 +1176,14 @@ static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain, else if (iopte_is_large(*pte)) ret = omap_iommu_translate(*pte, da, IOLARGE_MASK); else - dev_err(dev, "bogus pte 0x%x", *pte); + dev_err(dev, "bogus pte 0x%x, da 0x%lx", *pte, da); } else { if (iopgd_is_section(*pgd)) ret = omap_iommu_translate(*pgd, da, IOSECTION_MASK); else if (iopgd_is_super(*pgd)) ret = omap_iommu_translate(*pgd, da, IOSUPER_MASK); else - dev_err(dev, "bogus pgd 0x%x", *pgd); + dev_err(dev, "bogus pgd 0x%x, da 0x%lx", *pgd, da); } return ret; @@ -1211,6 +1204,7 @@ static struct iommu_ops omap_iommu_ops = { .unmap = omap_iommu_unmap, .iova_to_phys = omap_iommu_iova_to_phys, .domain_has_cap = omap_iommu_domain_has_cap, + .pgsize_bitmap = OMAP_IOMMU_PGSIZES, }; static int __init omap_iommu_init(void) diff --git a/drivers/iommu/omap-iovmm.c b/drivers/iommu/omap-iovmm.c index 46be456fcc00..2e10c3e0a7ae 100644 --- a/drivers/iommu/omap-iovmm.c +++ b/drivers/iommu/omap-iovmm.c @@ -231,12 +231,14 @@ static struct iovm_struct *__find_iovm_area(struct omap_iommu *obj, /** * omap_find_iovm_area - find iovma which includes @da + * @dev: client device * @da: iommu device virtual address * * Find the existing iovma starting at @da */ -struct iovm_struct *omap_find_iovm_area(struct omap_iommu *obj, u32 da) +struct iovm_struct *omap_find_iovm_area(struct device *dev, u32 da) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); struct iovm_struct *area; mutex_lock(&obj->mmap_lock); @@ -343,14 +345,15 @@ static void free_iovm_area(struct omap_iommu *obj, struct iovm_struct *area) /** * omap_da_to_va - convert (d) to (v) - * @obj: objective iommu + * @dev: client device * @da: iommu device virtual address * @va: mpu virtual address * * Returns mpu virtual addr which corresponds to a given device virtual addr */ -void *omap_da_to_va(struct omap_iommu *obj, u32 da) +void *omap_da_to_va(struct device *dev, u32 da) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); void *va = NULL; struct iovm_struct *area; @@ -410,7 +413,6 @@ static int map_iovm_area(struct iommu_domain *domain, struct iovm_struct *new, unsigned int i, j; struct scatterlist *sg; u32 da = new->da_start; - int order; if (!domain || !sgt) return -EINVAL; @@ -429,12 +431,10 @@ static int map_iovm_area(struct iommu_domain *domain, struct iovm_struct *new, if (bytes_to_iopgsz(bytes) < 0) goto err_out; - order = get_order(bytes); - pr_debug("%s: [%d] %08x %08x(%x)\n", __func__, i, da, pa, bytes); - err = iommu_map(domain, da, pa, order, flags); + err = iommu_map(domain, da, pa, bytes, flags); if (err) goto err_out; @@ -449,10 +449,9 @@ err_out: size_t bytes; bytes = sg->length + sg->offset; - order = get_order(bytes); /* ignore failures.. we're already handling one */ - iommu_unmap(domain, da, order); + iommu_unmap(domain, da, bytes); da += bytes; } @@ -467,7 +466,8 @@ static void unmap_iovm_area(struct iommu_domain *domain, struct omap_iommu *obj, size_t total = area->da_end - area->da_start; const struct sg_table *sgt = area->sgt; struct scatterlist *sg; - int i, err; + int i; + size_t unmapped; BUG_ON(!sgtable_ok(sgt)); BUG_ON((!total) || !IS_ALIGNED(total, PAGE_SIZE)); @@ -475,13 +475,11 @@ static void unmap_iovm_area(struct iommu_domain *domain, struct omap_iommu *obj, start = area->da_start; for_each_sg(sgt->sgl, sg, sgt->nents, i) { size_t bytes; - int order; bytes = sg->length + sg->offset; - order = get_order(bytes); - err = iommu_unmap(domain, start, order); - if (err < 0) + unmapped = iommu_unmap(domain, start, bytes); + if (unmapped < bytes) break; dev_dbg(obj->dev, "%s: unmap %08x(%x) %08x\n", @@ -582,16 +580,18 @@ __iommu_vmap(struct iommu_domain *domain, struct omap_iommu *obj, /** * omap_iommu_vmap - (d)-(p)-(v) address mapper - * @obj: objective iommu + * @domain: iommu domain + * @dev: client device * @sgt: address of scatter gather table * @flags: iovma and page property * * Creates 1-n-1 mapping with given @sgt and returns @da. * All @sgt element must be io page size aligned. */ -u32 omap_iommu_vmap(struct iommu_domain *domain, struct omap_iommu *obj, u32 da, +u32 omap_iommu_vmap(struct iommu_domain *domain, struct device *dev, u32 da, const struct sg_table *sgt, u32 flags) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); size_t bytes; void *va = NULL; @@ -622,15 +622,17 @@ EXPORT_SYMBOL_GPL(omap_iommu_vmap); /** * omap_iommu_vunmap - release virtual mapping obtained by 'omap_iommu_vmap()' - * @obj: objective iommu + * @domain: iommu domain + * @dev: client device * @da: iommu device virtual address * * Free the iommu virtually contiguous memory area starting at * @da, which was returned by 'omap_iommu_vmap()'. */ struct sg_table * -omap_iommu_vunmap(struct iommu_domain *domain, struct omap_iommu *obj, u32 da) +omap_iommu_vunmap(struct iommu_domain *domain, struct device *dev, u32 da) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); struct sg_table *sgt; /* * 'sgt' is allocated before 'omap_iommu_vmalloc()' is called. @@ -647,7 +649,7 @@ EXPORT_SYMBOL_GPL(omap_iommu_vunmap); /** * omap_iommu_vmalloc - (d)-(p)-(v) address allocator and mapper - * @obj: objective iommu + * @dev: client device * @da: contiguous iommu virtual memory * @bytes: allocation size * @flags: iovma and page property @@ -656,9 +658,10 @@ EXPORT_SYMBOL_GPL(omap_iommu_vunmap); * @da again, which might be adjusted if 'IOVMF_DA_FIXED' is not set. */ u32 -omap_iommu_vmalloc(struct iommu_domain *domain, struct omap_iommu *obj, u32 da, +omap_iommu_vmalloc(struct iommu_domain *domain, struct device *dev, u32 da, size_t bytes, u32 flags) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); void *va; struct sg_table *sgt; @@ -698,15 +701,16 @@ EXPORT_SYMBOL_GPL(omap_iommu_vmalloc); /** * omap_iommu_vfree - release memory allocated by 'omap_iommu_vmalloc()' - * @obj: objective iommu + * @dev: client device * @da: iommu device virtual address * * Frees the iommu virtually continuous memory area starting at * @da, as obtained from 'omap_iommu_vmalloc()'. */ -void omap_iommu_vfree(struct iommu_domain *domain, struct omap_iommu *obj, +void omap_iommu_vfree(struct iommu_domain *domain, struct device *dev, const u32 da) { + struct omap_iommu *obj = dev_to_omap_iommu(dev); struct sg_table *sgt; sgt = unmap_vm_area(domain, obj, da, vfree, |