From 0cfb8f0c3e21e36d4a6e472e4c419d58ba848698 Mon Sep 17 00:00:00 2001
From: Tang Chen
Date: Fri, 29 Aug 2014 15:18:31 -0700
Subject: memblock, memhotplug: fix wrong type in memblock_find_in_range_node().

In memblock_find_in_range_node(), ret is defined as an int, but it should
be phys_addr_t because it is used to store the return value of
__memblock_find_range_bottom_up().

The bug has not been triggered so far because, when allocating low memory
near the kernel end, the int ret does not happen to be negative.  Once we
start allocating memory on other nodes, though, the int ret can become
negative, and the kernel will panic.

A simple way to reproduce this: comment out the following line in
numa_init(),

	memblock_set_bottom_up(false);

and the kernel won't boot.

Reported-by: Xishi Qiu
Signed-off-by: Tang Chen
Tested-by: Xishi Qiu
Cc: [3.13+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memblock.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 6d2f219a48b0..70fad0c0dafb 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -192,8 +192,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
 					phys_addr_t align, phys_addr_t start,
 					phys_addr_t end, int nid)
 {
-	int ret;
-	phys_addr_t kernel_end;
+	phys_addr_t kernel_end, ret;
 
 	/* pump up @end */
 	if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
--
cgit v1.2.3

From ce8369bcbeeea6dfe24a6c8f60d2fcfce0432830 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox
Date: Fri, 29 Aug 2014 15:18:33 -0700
Subject: mm: actually clear pmd_numa before invalidating

Commit 67f87463d3a3 ("mm: clear pmd_numa before invalidating") cleared the
NUMA bit in a copy of the PMD entry, but then wrote back the original
entry, so the cleared copy was never installed.  Write the modified copy
back instead.

Signed-off-by: Matthew Wilcox
Acked-by: Mel Gorman
Reviewed-by: Rik van Riel
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/pgtable-generic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index a8b919925934..dfb79e028ecb 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -195,7 +195,7 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 	pmd_t entry = *pmdp;
 	if (pmd_numa(entry))
 		entry = pmd_mknonnuma(entry);
-	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
+	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
--
cgit v1.2.3
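The memblock fix above is essentially an integer-truncation bug: a 64-bit
phys_addr_t stored in an int.  The following is a standalone userspace
sketch of that bug class, not kernel code; fake_alloc() and the address
value are invented for illustration, and the conversion result shown is
the usual LP64 behaviour (strictly, implementation-defined in C).

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

typedef uint64_t phys_addr_t;	/* as on 64-bit kernels */

/* Hypothetical allocator returning memory on a far-away node. */
static phys_addr_t fake_alloc(void)
{
	return 0x180000000ULL;	/* 6GB: fits a phys_addr_t, not an int */
}

int main(void)
{
	phys_addr_t good = fake_alloc();
	int bad = (int)fake_alloc();	/* keeps only the low 32 bits; here it goes negative */

	printf("as phys_addr_t: 0x%" PRIx64 "\n", good);
	printf("as int:         %d\n", bad);
	return 0;
}

With the wrong type, a perfectly valid allocation can be returned
truncated or mistaken for a failure, which matches the changelog's
description of allocations on other nodes panicking the kernel.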
From 137f8cff505ace6251dc442c7aa973d60c801a79 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Fri, 29 Aug 2014 15:18:40 -0700
Subject: mm/zpool: use prefixed module loading

To avoid potential format string expansion via module parameters, do not
use the zpool type directly in request_module() without a format string.
Additionally, to avoid arbitrary modules being loaded via the zpool API
(e.g. via the zswap_zpool_type module parameter), add a "zpool-" prefix to
the requested module, as well as module aliases for the existing zpool
types (zbud and zsmalloc).

Signed-off-by: Kees Cook
Cc: Seth Jennings
Cc: Minchan Kim
Cc: Nitin Gupta
Acked-by: Dan Streetman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/zbud.c     | 1 +
 mm/zpool.c    | 2 +-
 mm/zsmalloc.c | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/zbud.c b/mm/zbud.c
index a05790b1915e..f26e7fcc7fa2 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -195,6 +195,7 @@ static struct zpool_driver zbud_zpool_driver = {
 	.total_size =	zbud_zpool_total_size,
 };
 
+MODULE_ALIAS("zpool-zbud");
 #endif /* CONFIG_ZPOOL */
 
 /*****************
diff --git a/mm/zpool.c b/mm/zpool.c
index e40612a1df00..739cdf0d183a 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -150,7 +150,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
 	driver = zpool_get_driver(type);
 
 	if (!driver) {
-		request_module(type);
+		request_module("zpool-%s", type);
 		driver = zpool_get_driver(type);
 	}
 
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 4e2fc83cb394..94f38fac5e81 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -315,6 +315,7 @@ static struct zpool_driver zs_zpool_driver = {
 	.total_size =	zs_zpool_total_size,
 };
 
+MODULE_ALIAS("zpool-zsmalloc");
 #endif /* CONFIG_ZPOOL */
 
 /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */
--
cgit v1.2.3

From 7ea8574e5fa31f43d8098a028f12ba6a9c9f3530 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Fri, 29 Aug 2014 15:18:42 -0700
Subject: hugetlb_cgroup: use lockdep_assert_held rather than spin_is_locked

spin_lock may be an empty struct for !SMP configurations, so
arch_spin_is_locked may unconditionally return 0 and trigger the VM_BUG_ON
even when the lock is held.

Replace spin_is_locked with lockdep_assert_held.  We will not BUG anymore,
and it is questionable whether crashing makes much sense in the uncharge
path anyway: uncharge happens after the last page reference was released,
so nobody should touch the page, and the function doesn't update any
shared state except for the res counter, which uses synchronization of its
own.

Signed-off-by: Michal Hocko
Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb_cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 9eebfadeeee1..a67c26e0f360 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -217,7 +217,7 @@ void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
 	if (hugetlb_cgroup_disabled())
 		return;
 
-	VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
+	lockdep_assert_held(&hugetlb_lock);
 	h_cg = hugetlb_cgroup_from_page(page);
 	if (unlikely(!h_cg))
 		return;
--
cgit v1.2.3
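The zpool change above combines two defensive patterns: never feed an
externally controlled string to a printf-style function as its format
argument, and namespace the auto-loaded module name so that only modules
which opt in via MODULE_ALIAS("zpool-<type>") can be requested.  Below is
a standalone userspace sketch of both points; fake_request_module() and
the strings are made up for illustration, and this is not the kernel code
itself.

#include <stdio.h>

/* Stand-in for request_module(): just reports what would be loaded. */
static void fake_request_module(const char *name)
{
	printf("would load module: %s\n", name);
}

int main(void)
{
	/* imagine this string came from an untrusted module parameter */
	const char *type = "zsmalloc";
	char name[64];

	/*
	 * printf(type) would interpret the untrusted data as a format
	 * string; always pass it as an argument to a fixed format instead.
	 */
	printf("requested type: %s\n", type);

	/* prefix the name so arbitrary modules cannot be auto-loaded */
	snprintf(name, sizeof(name), "zpool-%s", type);
	fake_request_module(name);
	return 0;
}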
From b38af4721f59d0b564468f623b3e52a638195015 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Fri, 29 Aug 2014 15:18:44 -0700
Subject: x86,mm: fix pte_special versus pte_numa

Sasha Levin has shown oopses on ffffea0003480048 and ffffea0003480008 at
mm/memory.c:1132, running Trinity on different 3.16-rc-next kernels: that
is where zap_pte_range() checks page->mapping to see if PageAnon(page).

Those addresses fit struct pages for pfns d2001 and d2000, and in each
dump a register or a stack slot showed d2001730 or d2000730: pte flags
0x730 are PCD ACCESSED PROTNONE SPECIAL IOMAP; and Sasha's e820 map has a
hole between cfffffff and 100000000, which would need special access.

Commit c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on
the PMD and PTE levels") has broken vm_normal_page(): a PROTNONE SPECIAL
pte no longer passes the pte_special() test, so zap_pte_range() goes on to
try to access a non-existent struct page.

Fix this by refining pte_special() (SPECIAL with PRESENT or PROTNONE) to
complement pte_numa() (SPECIAL with neither PRESENT nor PROTNONE).

A hint that this was a problem was that c46a7c817e66 added a pte_numa()
test to vm_normal_page(), and moved its is_zero_pfn() test from the slow
path to the fast path: that was papering over a pte_special() snag when
the zero page was encountered during zap.  This patch reverts
vm_normal_page() to how it was before, again relying on pte_special().

It still appears that this patch may be incomplete: aren't there other
places which need to handle PROTNONE along with PRESENT?  For example,
pte_mknuma() clears _PAGE_PRESENT and sets _PAGE_NUMA, but on a PROT_NONE
area that would make it pte_special().  This is side-stepped by the fact
that NUMA hinting faults skip PROT_NONE VMAs, and there are no grounds on
which a NUMA hinting fault on a PROT_NONE VMA would be interesting.

Fixes: c46a7c817e66 ("x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels")
Reported-by: Sasha Levin
Tested-by: Sasha Levin
Signed-off-by: Hugh Dickins
Signed-off-by: Mel Gorman
Cc: "Kirill A. Shutemov"
Cc: Peter Zijlstra
Cc: Rik van Riel
Cc: Johannes Weiner
Cc: Cyrill Gorcunov
Cc: Matthew Wilcox
Cc: [3.16]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index ab3537bcfed2..adeac306610f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -751,7 +751,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 	unsigned long pfn = pte_pfn(pte);
 
 	if (HAVE_PTE_SPECIAL) {
-		if (likely(!pte_special(pte) || pte_numa(pte)))
+		if (likely(!pte_special(pte)))
 			goto check_pfn;
 		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
 			return NULL;
@@ -777,15 +777,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		}
 	}
 
+	if (is_zero_pfn(pfn))
+		return NULL;
 check_pfn:
 	if (unlikely(pfn > highest_memmap_pfn)) {
 		print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
 	}
 
-	if (is_zero_pfn(pfn))
-		return NULL;
-
 	/*
 	 * NOTE! We still have PageReserved() pages in the page tables.
 	 * eg. VDSO mappings can cause them to exist.
--
cgit v1.2.3
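The key invariant in the fix above is that the refined pte_special()
(SPECIAL with PRESENT or PROTNONE) and pte_numa() (SPECIAL with neither)
are mutually exclusive.  The sketch below models only that boolean logic
in userspace C; the bit positions and helper names are illustrative, not
the real x86 pte_t definitions.

#include <stdbool.h>
#include <stdio.h>

#define FAKE_PRESENT  (1UL << 0)	/* illustrative bit positions */
#define FAKE_PROTNONE (1UL << 8)
#define FAKE_SPECIAL  (1UL << 9)

/* pte_special(): SPECIAL with PRESENT or PROTNONE */
static bool model_pte_special(unsigned long pte)
{
	return (pte & FAKE_SPECIAL) &&
	       (pte & (FAKE_PRESENT | FAKE_PROTNONE));
}

/* pte_numa(): SPECIAL with neither PRESENT nor PROTNONE */
static bool model_pte_numa(unsigned long pte)
{
	return (pte & FAKE_SPECIAL) &&
	       !(pte & (FAKE_PRESENT | FAKE_PROTNONE));
}

int main(void)
{
	unsigned long numa     = FAKE_SPECIAL;			/* NUMA hinting pte */
	unsigned long special  = FAKE_SPECIAL | FAKE_PRESENT;	/* ordinary special pte */
	unsigned long protnone = FAKE_SPECIAL | FAKE_PROTNONE;	/* the oopsing case above */

	printf("numa:     special=%d numa=%d\n",
	       model_pte_special(numa), model_pte_numa(numa));
	printf("special:  special=%d numa=%d\n",
	       model_pte_special(special), model_pte_numa(special));
	printf("protnone: special=%d numa=%d\n",
	       model_pte_special(protnone), model_pte_numa(protnone));
	return 0;
}

With this split, the PROTNONE SPECIAL pte from Sasha's oops is once again
caught by the pte_special() test in vm_normal_page(), so zap no longer
dereferences a non-existent struct page.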