diff options
author | Matthew Wilcox <willy@linux.intel.com> | 2017-02-24 14:57:02 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-02-24 17:46:54 -0800 |
commit | a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517 (patch) | |
tree | 54d78e89c63e519cb9e00fdab9efbf3189ef2f5e /mm/memory.c | |
parent | a2d581675d485eb7188f521f36efc114639a3096 (diff) | |
download | linux-stable-a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517.tar.gz linux-stable-a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517.tar.bz2 linux-stable-a00cc7d9dd93d66a3fb83fc52aa57a4bec51c517.zip |
mm, x86: add support for PUD-sized transparent hugepages
The current transparent hugepage code only supports PMDs. This patch
adds support for transparent use of PUDs with DAX. It does not include
support for anonymous pages. x86 support code also added.
Most of this patch simply parallels the work that was done for huge
PMDs. The only major difference is how the new ->pud_entry method in
mm_walk works. The ->pmd_entry method replaces the ->pte_entry method,
whereas the ->pud_entry method works along with either ->pmd_entry or
->pte_entry. The pagewalk code takes care of locking the PUD before
calling ->pud_walk, so handlers do not need to worry whether the PUD is
stable.
[dave.jiang@intel.com: fix SMP x86 32bit build for native_pud_clear()]
Link: http://lkml.kernel.org/r/148719066814.31111.3239231168815337012.stgit@djiang5-desk3.ch.intel.com
[dave.jiang@intel.com: native_pud_clear missing on i386 build]
Link: http://lkml.kernel.org/r/148640375195.69754.3315433724330910314.stgit@djiang5-desk3.ch.intel.com
Link: http://lkml.kernel.org/r/148545059381.17912.8602162635537598445.stgit@djiang5-desk3.ch.intel.com
Signed-off-by: Matthew Wilcox <mawilcox@microsoft.com>
Signed-off-by: Dave Jiang <dave.jiang@intel.com>
Tested-by: Alexander Kapshuk <alexander.kapshuk@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jan Kara <jack@suse.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Nilesh Choudhury <nilesh.choudhury@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/memory.c')
-rw-r--r-- | mm/memory.c | 88 |
1 files changed, 81 insertions, 7 deletions
diff --git a/mm/memory.c b/mm/memory.c index e721e8eba570..41e2a2d4b2a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1001,7 +1001,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src next = pmd_addr_end(addr, end); if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; - VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); + VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) @@ -1032,6 +1032,18 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src src_pud = pud_offset(src_pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { + int err; + + VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, vma); + err = copy_huge_pud(dst_mm, src_mm, + dst_pud, src_pud, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, @@ -1263,9 +1275,19 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_trans_huge(*pud) || pud_devmap(*pud)) { + if (next - addr != HPAGE_PUD_SIZE) { + VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma); + split_huge_pud(vma, pud, addr); + } else if (zap_huge_pud(tlb, vma, pud, addr)) + goto next; + /* fall through */ + } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); +next: + cond_resched(); } while (pud++, addr = next, addr != end); return addr; @@ -3490,6 +3512,30 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); } +static int create_huge_pud(struct vm_fault *vmf) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + +static int wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* No support for anonymous transparent PUD pages yet */ + if (vma_is_anonymous(vmf->vma)) + return VM_FAULT_FALLBACK; + if (vmf->vma->vm_ops->huge_fault) + return vmf->vma->vm_ops->huge_fault(vmf); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + return VM_FAULT_FALLBACK; +} + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3605,14 +3651,41 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, }; struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; - pud_t *pud; int ret; pgd = pgd_offset(mm, address); - pud = pud_alloc(mm, pgd, address); - if (!pud) + + vmf.pud = pud_alloc(mm, pgd, address); + if (!vmf.pud) return VM_FAULT_OOM; - vmf.pmd = pmd_alloc(mm, pud, address); + if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) { + vmf.flags |= FAULT_FLAG_SIZE_PUD; + ret = create_huge_pud(&vmf); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + pud_t orig_pud = *vmf.pud; + + barrier(); + if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + vmf.flags |= FAULT_FLAG_SIZE_PUD; + + /* NUMA case for anonymous PUDs would go here */ + + if (dirty && !pud_write(orig_pud)) { + ret = wp_huge_pud(&vmf, orig_pud); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } else { + huge_pud_set_accessed(&vmf, orig_pud); + return 0; + } + } + } + + vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { @@ -3743,13 +3816,14 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) */ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) { + spinlock_t *ptl; pmd_t *new = pmd_alloc_one(mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ - spin_lock(&mm->page_table_lock); + ptl = pud_lock(mm, pud); #ifndef __ARCH_HAS_4LEVEL_HACK if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); @@ -3763,7 +3837,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) } else /* Another has populated it */ pmd_free(mm, new); #endif /* __ARCH_HAS_4LEVEL_HACK */ - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); return 0; } #endif /* __PAGETABLE_PMD_FOLDED */ |