From 1b028f784e8c341e762c264f70dc0ca1418c8b7a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 6 Mar 2017 17:17:19 +0300 Subject: x86/mm: Introduce mmap_compat_base() for 32-bit mmap() mmap() uses a base address, from which it starts to look for a free space for allocation. The base address is stored in mm->mmap_base, which is calculated during exec(). The address depends on task's size, set rlimit for stack, ASLR randomization. The base depends on the task size and the number of random bits which are different for 64-bit and 32bit applications. Due to the fact, that the base address is fixed, its mmap() from a compat (32bit) syscall issued by a 64bit task will return a address which is based on the 64bit base address and does not fit into the 32bit address space (4GB). The returned pointer is truncated to 32bit, which results in an invalid address. To solve store a seperate compat address base plus a compat legacy address base in mm_struct. These bases are calculated at exec() time and can be used later to address the 32bit compat mmap() issued by 64 bit applications. As a consequence of this change 32-bit applications issuing a 64-bit syscall (after doing a long jump) will get a 64-bit mapping now. Before this change 32-bit applications always got a 32bit mapping. [ tglx: Massaged changelog and added a comment ] Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170306141721.9188-4-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- include/linux/mm_types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f60f45fe226f..45cdb27791a3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -367,6 +367,11 @@ struct mm_struct { #endif unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* Base adresses for compatible mmap() */ + unsigned long mmap_compat_base; + unsigned long mmap_compat_legacy_base; +#endif unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; -- cgit v1.2.3 From 9a804fecee232e71b47ac37d62fd3d5d66b08b91 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:49 +0300 Subject: mm/gup: Drop the arch_pte_access_permitted() MMU callback The only arch that defines it to something meaningful is x86. But x86 doesn't use the generic GUP_fast() implementation -- the only place where the callback is called. Let's drop it. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dann Frazier Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- include/asm-generic/mm_hooks.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index cc5d9a1405df..41e5b6784b97 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -32,10 +32,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif /* _ASM_GENERIC_MM_HOOKS_H */ -- cgit v1.2.3 From e7884f8ead4a301b04687a3238527b06feef8ea0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:50 +0300 Subject: mm/gup: Move permission checks into helpers This is a preparation patch for the transition of x86 to the generic GUP_fast() implementation. On x86, we would need to do additional permission checks to determine if access is allowed. Let's abstract it out into separate helpers. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dann Frazier Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- include/asm-generic/pgtable.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 1fad160f35de..7dfa767dc680 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -341,6 +341,31 @@ static inline int pte_unused(pte_t pte) } #endif +#ifndef pte_access_permitted +#define pte_access_permitted(pte, write) \ + (pte_present(pte) && (!(write) || pte_write(pte))) +#endif + +#ifndef pmd_access_permitted +#define pmd_access_permitted(pmd, write) \ + (pmd_present(pmd) && (!(write) || pmd_write(pmd))) +#endif + +#ifndef pud_access_permitted +#define pud_access_permitted(pud, write) \ + (pud_present(pud) && (!(write) || pud_write(pud))) +#endif + +#ifndef p4d_access_permitted +#define p4d_access_permitted(p4d, write) \ + (p4d_present(p4d) && (!(write) || p4d_write(p4d))) +#endif + +#ifndef pgd_access_permitted +#define pgd_access_permitted(pgd, write) \ + (pgd_present(pgd) && (!(write) || pgd_write(pgd))) +#endif + #ifndef __HAVE_ARCH_PMD_SAME #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) -- cgit v1.2.3 From b59f65fa076a8eac2ff3a8ab7f8e1705b9fa86cb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:53 +0300 Subject: mm/gup: Implement the dev_pagemap() logic in the generic get_user_pages_fast() function This is a preparation patch for the transition of x86 to the generic GUP_fast() implementation. Prepare generic GUP_fast() to handle dev_pagemap(). At the moment, it's only implemented on x86. On non-x86, the new code will be compiled out. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dan Williams Cc: Dann Frazier Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/mm.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f01c88f0800..e197d3ca3e8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -430,6 +430,10 @@ static inline int pud_devmap(pud_t pud) { return 0; } +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif /* -- cgit v1.2.3 From f2a6a7050109e0a5c7a84c70aa6010f682b2f1ee Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Mar 2017 21:55:15 +0300 Subject: x86: Convert the rest of the code to support p4d_t This patch converts x86 to use proper folding of a new (fifth) page table level with . That's a bit of a kitchen sink patch, but I don't see how to split it further without hurting bisectability. Signed-off-by: Kirill A. Shutemov Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170317185515.8636-7-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- include/trace/events/xen.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index bce990f5a35d..31acce9019a6 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -241,21 +241,21 @@ TRACE_EVENT(xen_mmu_set_pud, (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval) ); -TRACE_EVENT(xen_mmu_set_pgd, - TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval), - TP_ARGS(pgdp, user_pgdp, pgdval), +TRACE_EVENT(xen_mmu_set_p4d, + TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval), + TP_ARGS(p4dp, user_p4dp, p4dval), TP_STRUCT__entry( - __field(pgd_t *, pgdp) - __field(pgd_t *, user_pgdp) - __field(pgdval_t, pgdval) - ), - TP_fast_assign(__entry->pgdp = pgdp; - __entry->user_pgdp = user_pgdp; - __entry->pgdval = pgdval.pgd), - TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)", - __entry->pgdp, __entry->user_pgdp, - (int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)), - (int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval) + __field(p4d_t *, p4dp) + __field(p4d_t *, user_p4dp) + __field(p4dval_t, p4dval) + ), + TP_fast_assign(__entry->p4dp = p4dp; + __entry->user_p4dp = user_p4dp; + __entry->p4dval = p4d_val(p4dval)), + TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)", + __entry->p4dp, __entry->user_p4dp, + (int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)), + (int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval) ); TRACE_EVENT(xen_mmu_pud_clear, -- cgit v1.2.3 From 591a3d7c09fa08baff48ad86c2347dbd28a52753 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Mar 2017 14:13:05 +0300 Subject: mm: Fix false-positive VM_BUG_ON() in page_cache_{get,add}_speculative() 0day testing by Fengguang Wu triggered this crash while running Trinity: kernel BUG at include/linux/pagemap.h:151! ... CPU: 0 PID: 458 Comm: trinity-c0 Not tainted 4.11.0-rc2-00251-g2947ba0 #1 ... Call Trace: __get_user_pages_fast() get_user_pages_fast() get_futex_key() futex_requeue() do_futex() SyS_futex() do_syscall_64() entry_SYSCALL64_slow_path() It' VM_BUG_ON() due to false-negative in_atomic(). We call page_cache_get_speculative() with disabled local interrupts. It should be atomic enough. So let's check for disabled interrupts in the VM_BUG_ON() condition too, to resolve this. ( This got triggered by the conversion of the x86 GUP code to the generic GUP code. ) Reported-by: Fengguang Wu Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170324114709.pcytvyb3d6ajux33@black.fi.intel.com Signed-off-by: Ingo Molnar --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 84943e8057ef..316a19f6b635 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page) #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif /* * Preempt must be disabled here - we rely on rcu_read_lock doing @@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif VM_BUG_ON_PAGE(page_count(page) == 0, page); page_ref_add(page, count); -- cgit v1.2.3 From 71389703839ebe9cb426c72d5f0bd549592e583c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 28 Apr 2017 10:23:37 -0700 Subject: mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_page_release(), since it now maintains its own elevated reference. This speeds up things while also making the pmem refcounting more robust going forward. Suggested-by: Kirill Shutemov Tested-by: Kirill Shutemov Signed-off-by: Dan Williams Reviewed-by: Logan Gunthorpe Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Jérôme Glisse Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- include/linux/mm.h | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index a835edd2db34..695da2a19b4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -762,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void get_zone_device_page(struct page *page); -void put_zone_device_page(struct page *page); static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } #else -static inline void get_zone_device_page(struct page *page) -{ -} -static inline void put_zone_device_page(struct page *page) -{ -} static inline bool is_zone_device_page(const struct page *page) { return false; @@ -790,9 +782,6 @@ static inline void get_page(struct page *page) */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); - - if (unlikely(is_zone_device_page(page))) - get_zone_device_page(page); } static inline void put_page(struct page *page) @@ -801,9 +790,6 @@ static inline void put_page(struct page *page) if (put_page_testzero(page)) __put_page(page); - - if (unlikely(is_zone_device_page(page))) - put_zone_device_page(page); } #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) -- cgit v1.2.3