summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c2
-rw-r--r--mm/gup.c8
-rw-r--r--mm/huge_memory.c8
-rw-r--r--mm/hugetlb.c83
-rw-r--r--mm/kasan/common.c65
-rw-r--r--mm/kasan/init.c2
-rw-r--r--mm/memory-failure.c16
-rw-r--r--mm/memory.c43
-rw-r--r--mm/migrate.c15
-rw-r--r--mm/mincore.c4
-rw-r--r--mm/mremap.c66
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/page_io.c4
-rw-r--r--mm/rmap.c8
-rw-r--r--mm/slab.c6
-rw-r--r--mm/slub.c2
-rw-r--r--mm/swap.c2
-rw-r--r--mm/usercopy.c9
-rw-r--r--mm/userfaultfd.c13
-rw-r--r--mm/util.c2
20 files changed, 208 insertions, 158 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 29655fb47a2c..9f5e323e883e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1125,7 +1125,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
break;
}
- if (unlikely(signal_pending_state(state, current))) {
+ if (signal_pending_state(state, current)) {
ret = -EINTR;
break;
}
diff --git a/mm/gup.c b/mm/gup.c
index 8cb68a50dbdf..05acd7e2eb22 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -727,7 +727,7 @@ retry:
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
- if (unlikely(fatal_signal_pending(current))) {
+ if (fatal_signal_pending(current)) {
ret = -ERESTARTSYS;
goto out;
}
@@ -1813,8 +1813,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
+ if (unlikely(!access_ok((void __user *)start, len)))
return 0;
/*
@@ -1868,8 +1867,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
if (nr_pages <= 0)
return 0;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
+ if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
if (gup_fast_permitted(start, nr_pages, write)) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cbd977b1d60d..faf357eaf0ce 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -568,7 +568,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
return VM_FAULT_FALLBACK;
}
- pgtable = pte_alloc_one(vma->vm_mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
ret = VM_FAULT_OOM;
goto release;
@@ -702,7 +702,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
struct page *zero_page;
bool set;
vm_fault_t ret;
- pgtable = pte_alloc_one(vma->vm_mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
zero_page = mm_get_huge_zero_page(vma->vm_mm);
@@ -791,7 +791,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
return VM_FAULT_SIGBUS;
if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm, addr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (!pgtable)
return VM_FAULT_OOM;
}
@@ -927,7 +927,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!vma_is_anonymous(vma))
return 0;
- pgtable = pte_alloc_one(dst_mm, addr);
+ pgtable = pte_alloc_one(dst_mm);
if (unlikely(!pgtable))
goto out;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e37efd5d8318..df2e7dd5ff17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3238,7 +3238,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
struct page *ptepage;
unsigned long addr;
int cow;
- struct address_space *mapping = vma->vm_file->f_mapping;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
@@ -3250,23 +3249,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
mmu_notifier_range_init(&range, src, vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
- } else {
- /*
- * For shared mappings i_mmap_rwsem must be held to call
- * huge_pte_alloc, otherwise the returned ptep could go
- * away if part of a shared pmd and another thread calls
- * huge_pmd_unshare.
- */
- i_mmap_lock_read(mapping);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
-
src_pte = huge_pte_offset(src, addr, sz);
if (!src_pte)
continue;
-
dst_pte = huge_pte_alloc(dst, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
@@ -3337,8 +3326,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (cow)
mmu_notifier_invalidate_range_end(&range);
- else
- i_mmap_unlock_read(mapping);
return ret;
}
@@ -3755,16 +3742,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
}
/*
- * We can not race with truncation due to holding i_mmap_rwsem.
- * Check once here for faults beyond end of file.
+ * Use page lock to guard against racing truncation
+ * before we get page_table_lock.
*/
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
retry:
page = find_lock_page(mapping, idx);
if (!page) {
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
/*
* Check for page in userfault range
*/
@@ -3784,18 +3771,14 @@ retry:
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
+ * hugetlb_fault_mutex must be dropped before
+ * handling userfault. Reacquire after handling
+ * fault to make calling code simpler.
*/
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
-
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-
- i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
@@ -3854,6 +3837,9 @@ retry:
}
ptl = huge_pte_lock(h, mm, ptep);
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto backout;
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
@@ -3940,11 +3926,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
- /*
- * Since we hold no locks, ptep could be stale. That is
- * OK as we are only making decisions based on content and
- * not actually modifying content here.
- */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
@@ -3952,33 +3933,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
+ } else {
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep)
+ return VM_FAULT_OOM;
}
- /*
- * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
- * until finished with ptep. This serves two purposes:
- * 1) It prevents huge_pmd_unshare from being called elsewhere
- * and making the ptep no longer valid.
- * 2) It synchronizes us with file truncation.
- *
- * ptep could have already be assigned via huge_pte_offset. That
- * is OK, as huge_pte_alloc will return the same value unless
- * something changed.
- */
mapping = vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep) {
- i_mmap_unlock_read(mapping);
- return VM_FAULT_OOM;
- }
+ idx = vma_hugecache_offset(h, vma, haddr);
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- idx = vma_hugecache_offset(h, vma, haddr);
hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -4066,7 +4034,6 @@ out_ptl:
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4231,7 +4198,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
- if (unlikely(fatal_signal_pending(current))) {
+ if (fatal_signal_pending(current)) {
remainder = 0;
break;
}
@@ -4671,12 +4638,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode.
- * For hugetlbfs, this prevents removal of any page table entries associated
- * with the address space. This is important as we are setting up sharing
- * based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -4693,6 +4658,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_lock_write(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4722,6 +4688,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
+ i_mmap_unlock_write(mapping);
return pte;
}
@@ -4732,7 +4699,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 03d5d1374ca7..73c9cbfdedf4 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -298,8 +298,6 @@ void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
return;
}
- cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
-
*flags |= SLAB_KASAN;
}
@@ -349,28 +347,43 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
}
/*
- * Since it's desirable to only call object contructors once during slab
- * allocation, we preassign tags to all such objects. Also preassign tags for
- * SLAB_TYPESAFE_BY_RCU slabs to avoid use-after-free reports.
- * For SLAB allocator we can't preassign tags randomly since the freelist is
- * stored as an array of indexes instead of a linked list. Assign tags based
- * on objects indexes, so that objects that are next to each other get
- * different tags.
- * After a tag is assigned, the object always gets allocated with the same tag.
- * The reason is that we can't change tags for objects with constructors on
- * reallocation (even for non-SLAB_TYPESAFE_BY_RCU), because the constructor
- * code can save the pointer to the object somewhere (e.g. in the object
- * itself). Then if we retag it, the old saved pointer will become invalid.
+ * This function assigns a tag to an object considering the following:
+ * 1. A cache might have a constructor, which might save a pointer to a slab
+ * object somewhere (e.g. in the object itself). We preassign a tag for
+ * each object in caches with constructors during slab creation and reuse
+ * the same tag each time a particular object is allocated.
+ * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
+ * accessed after being freed. We preassign tags for objects in these
+ * caches as well.
+ * 3. For SLAB allocator we can't preassign tags randomly since the freelist
+ * is stored as an array of indexes instead of a linked list. Assign tags
+ * based on objects indexes, so that objects that are next to each other
+ * get different tags.
*/
-static u8 assign_tag(struct kmem_cache *cache, const void *object, bool new)
+static u8 assign_tag(struct kmem_cache *cache, const void *object,
+ bool init, bool krealloc)
{
+ /* Reuse the same tag for krealloc'ed objects. */
+ if (krealloc)
+ return get_tag(object);
+
+ /*
+ * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
+ * set, assign a tag when the object is being allocated (init == false).
+ */
if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
- return new ? KASAN_TAG_KERNEL : random_tag();
+ return init ? KASAN_TAG_KERNEL : random_tag();
+ /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
+ /* For SLAB assign tags based on the object index in the freelist. */
return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
#else
- return new ? random_tag() : get_tag(object);
+ /*
+ * For SLUB assign a random tag during slab creation, otherwise reuse
+ * the already assigned tag.
+ */
+ return init ? random_tag() : get_tag(object);
#endif
}
@@ -386,7 +399,8 @@ void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
__memset(alloc_info, 0, sizeof(*alloc_info));
if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- object = set_tag(object, assign_tag(cache, object, true));
+ object = set_tag(object,
+ assign_tag(cache, object, true, false));
return (void *)object;
}
@@ -452,8 +466,8 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
return __kasan_slab_free(cache, object, ip, true);
}
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags)
+static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags, bool krealloc)
{
unsigned long redzone_start;
unsigned long redzone_end;
@@ -471,7 +485,7 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
KASAN_SHADOW_SCALE_SIZE);
if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- tag = assign_tag(cache, object, false);
+ tag = assign_tag(cache, object, false, krealloc);
/* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
kasan_unpoison_shadow(set_tag(object, tag), size);
@@ -483,6 +497,12 @@ void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
return set_tag(object, tag);
}
+
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, size, flags, false);
+}
EXPORT_SYMBOL(kasan_kmalloc);
void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
@@ -522,7 +542,8 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
if (unlikely(!PageSlab(page)))
return kasan_kmalloc_large(object, size, flags);
else
- return kasan_kmalloc(page->slab_cache, object, size, flags);
+ return __kasan_kmalloc(page->slab_cache, object, size,
+ flags, true);
}
void kasan_poison_kfree(void *ptr, unsigned long ip)
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index 34afad56497b..45a1b5e38e1e 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -123,7 +123,7 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr,
pte_t *p;
if (slab_is_available())
- p = pte_alloc_one_kernel(&init_mm, addr);
+ p = pte_alloc_one_kernel(&init_mm);
else
p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
if (!p)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6379fff1a5ff..7c72f2a95785 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -966,7 +966,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
struct address_space *mapping;
LIST_HEAD(tokill);
- bool unmap_success = true;
+ bool unmap_success;
int kill = 1, forcekill;
struct page *hpage = *hpagep;
bool mlocked = PageMlocked(hpage);
@@ -1028,19 +1028,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill)
collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
- if (!PageHuge(hpage)) {
- unmap_success = try_to_unmap(hpage, ttu);
- } else if (mapping) {
- /*
- * For hugetlb pages, try_to_unmap could potentially call
- * huge_pmd_unshare. Because of this, take semaphore in
- * write mode here and set TTU_RMAP_LOCKED to indicate we
- * have taken the lock at this higer level.
- */
- i_mmap_lock_write(mapping);
- unmap_success = try_to_unmap(hpage, ttu|TTU_RMAP_LOCKED);
- i_mmap_unlock_write(mapping);
- }
+ unmap_success = try_to_unmap(hpage, ttu);
if (!unmap_success)
pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
diff --git a/mm/memory.c b/mm/memory.c
index 2dd2f9ab57f4..e11ca9dd823f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -400,10 +400,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
- pgtable_t new = pte_alloc_one(mm, address);
+ pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
@@ -434,9 +434,9 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
return 0;
}
-int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd)
{
- pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ pte_t *new = pte_alloc_one_kernel(&init_mm);
if (!new)
return -ENOMEM;
@@ -2896,7 +2896,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
@@ -2994,6 +2994,28 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ /*
+ * Preallocate pte before we take page_lock because this might lead to
+ * deadlocks for memcg reclaim which waits for pages under writeback:
+ * lock_page(A)
+ * SetPageWriteback(A)
+ * unlock_page(A)
+ * lock_page(B)
+ * lock_page(B)
+ * pte_alloc_pne
+ * shrink_page_list
+ * wait_on_page_writeback(A)
+ * SetPageWriteback(B)
+ * unlock_page(B)
+ * # flush A, B to clear the writeback
+ */
+ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
@@ -3043,7 +3065,7 @@ static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf)
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
spin_unlock(vmf->ptl);
vmf->prealloc_pte = NULL;
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -3122,7 +3144,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
- vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -3360,8 +3382,7 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
start_pgoff + nr_pages - 1);
if (pmd_none(*vmf->pmd)) {
- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
- vmf->address);
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -4078,8 +4099,8 @@ static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
goto out;
if (range) {
- range->start = address & PAGE_MASK;
- range->end = range->start + PAGE_SIZE;
+ mmu_notifier_range_init(range, mm, address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
diff --git a/mm/migrate.c b/mm/migrate.c
index 5d1839a9148d..a16b15090df3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1324,19 +1324,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
goto put_anon;
if (page_mapped(hpage)) {
- struct address_space *mapping = page_mapping(hpage);
-
- /*
- * try_to_unmap could potentially call huge_pmd_unshare.
- * Because of this, take semaphore in write mode here and
- * set TTU_RMAP_LOCKED to let lower levels know we have
- * taken the lock.
- */
- i_mmap_lock_write(mapping);
try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
- TTU_RMAP_LOCKED);
- i_mmap_unlock_write(mapping);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
page_was_mapped = 1;
}
@@ -2636,7 +2625,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(mm, pmdp, addr))
+ if (pte_alloc(mm, pmdp))
goto abort;
/* See the comment in pte_alloc_one_map() */
diff --git a/mm/mincore.c b/mm/mincore.c
index 4985965aa20a..218099b5ed31 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -233,14 +233,14 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
return -EINVAL;
/* ..and we need to be passed a valid user-space range */
- if (!access_ok(VERIFY_READ, (void __user *) start, len))
+ if (!access_ok((void __user *) start, len))
return -ENOMEM;
/* This also avoids any overflows on PAGE_ALIGN */
pages = len >> PAGE_SHIFT;
pages += (offset_in_page(len)) != 0;
- if (!access_ok(VERIFY_WRITE, vec, pages))
+ if (!access_ok(vec, pages))
return -EFAULT;
tmp = (void *) __get_free_page(GFP_USER);
diff --git a/mm/mremap.c b/mm/mremap.c
index def01d86e36f..3320616ed93f 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -191,6 +191,52 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
drop_rmap_locks(vma);
}
+#ifdef CONFIG_HAVE_MOVE_PMD
+static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t pmd;
+
+ if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
+ || old_end - old_addr < PMD_SIZE)
+ return false;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have release it.
+ */
+ if (WARN_ON(!pmd_none(*new_pmd)))
+ return false;
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_sem prevents deadlock.
+ */
+ old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+ new_ptl = pmd_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+ /* Clear the pmd */
+ pmd = *old_pmd;
+ pmd_clear(old_pmd);
+
+ VM_BUG_ON(!pmd_none(*new_pmd));
+
+ /* Set the new pmd */
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+
+ return true;
+}
+#endif
+
unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
@@ -235,8 +281,26 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
split_huge_pmd(vma, old_pmd, old_addr);
if (pmd_trans_unstable(old_pmd))
continue;
+ } else if (extent == PMD_SIZE) {
+#ifdef CONFIG_HAVE_MOVE_PMD
+ /*
+ * If the extent is PMD-sized, try to speed the move by
+ * moving at the PMD level if possible.
+ */
+ bool moved;
+
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
+ moved = move_normal_pmd(vma, old_addr, new_addr,
+ old_end, old_pmd, new_pmd);
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
+ if (moved)
+ continue;
+#endif
}
- if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
+
+ if (pte_alloc(new_vma->vm_mm, new_pmd))
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cde5dac6229a..d295c9bc01a8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2214,7 +2214,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
*/
boost_watermark(zone);
if (alloc_flags & ALLOC_KSWAPD)
- wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
@@ -3102,6 +3102,12 @@ struct page *rmqueue(struct zone *preferred_zone,
local_irq_restore(flags);
out:
+ /* Separate test+clear to avoid unnecessary atomics */
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ }
+
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
diff --git a/mm/page_io.c b/mm/page_io.c
index d975fa3f02aa..2e8019d0e048 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -401,6 +401,8 @@ int swap_readpage(struct page *page, bool synchronous)
get_task_struct(current);
bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ if (synchronous)
+ bio->bi_opf |= REQ_HIPRI;
count_vm_event(PSWPIN);
bio_get(bio);
qc = submit_bio(bio);
@@ -410,7 +412,7 @@ int swap_readpage(struct page *page, bool synchronous)
break;
if (!blk_poll(disk->queue, qc, true))
- break;
+ io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
diff --git a/mm/rmap.c b/mm/rmap.c
index 21a26cf51114..0454ecc29537 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -25,7 +25,6 @@
* page->flags PG_locked (lock_page)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
- * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone_lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -1372,16 +1371,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- mmu_notifier_range_init(&range, vma->vm_mm, vma->vm_start,
- min(vma->vm_end, vma->vm_start +
+ mmu_notifier_range_init(&range, vma->vm_mm, address,
+ min(vma->vm_end, address +
(PAGE_SIZE << compound_order(page))));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
- *
- * If called for a huge page, caller must hold i_mmap_rwsem
- * in write mode as it is possible to call huge_pmd_unshare.
*/
adjust_range_if_pmd_sharing_possible(vma, &range.start,
&range.end);
diff --git a/mm/slab.c b/mm/slab.c
index 73fe23e649c9..78eb8c5bf4e4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -666,8 +666,10 @@ static struct alien_cache *__alloc_alien_cache(int node, int entries,
struct alien_cache *alc = NULL;
alc = kmalloc_node(memsize, gfp, node);
- init_arraycache(&alc->ac, entries, batch);
- spin_lock_init(&alc->lock);
+ if (alc) {
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
return alc;
}
diff --git a/mm/slub.c b/mm/slub.c
index 36c0befeebd8..1e3d0ec4e200 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3846,6 +3846,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
unsigned int offset;
size_t object_size;
+ ptr = kasan_reset_tag(ptr);
+
/* Find object and usable object size. */
s = page->slab_cache;
diff --git a/mm/swap.c b/mm/swap.c
index 4d8a1f1afaab..4929bc1be60e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -126,7 +126,7 @@ void put_pages_list(struct list_head *pages)
while (!list_empty(pages)) {
struct page *victim;
- victim = list_entry(pages->prev, struct page, lru);
+ victim = lru_to_page(pages);
list_del(&victim->lru);
put_page(victim);
}
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e53f06..14faadcedd06 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -247,7 +247,8 @@ static DEFINE_STATIC_KEY_FALSE_RO(bypass_usercopy_checks);
/*
* Validates that the given object is:
* - not bogus address
- * - known-safe heap or stack object
+ * - fully contained by stack (or stack frame, when available)
+ * - fully within SLAB object (or object whitelist area, when available)
* - not in kernel text
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
@@ -262,9 +263,6 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
/* Check for invalid addresses. */
check_bogus_address((const unsigned long)ptr, n, to_user);
- /* Check for bad heap object. */
- check_heap_object(ptr, n, to_user);
-
/* Check for bad stack object. */
switch (check_stack_object(ptr, n)) {
case NOT_STACK:
@@ -282,6 +280,9 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user)
usercopy_abort("process stack", NULL, to_user, 0, n);
}
+ /* Check for bad heap object. */
+ check_heap_object(ptr, n, to_user);
+
/* Check for object in kernel to avoid text exposure. */
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 48368589f519..d59b5a73dfb3 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -267,14 +267,10 @@ retry:
VM_BUG_ON(dst_addr & ~huge_page_mask(h));
/*
- * Serialize via i_mmap_rwsem and hugetlb_fault_mutex.
- * i_mmap_rwsem ensures the dst_pte remains valid even
- * in the case of shared pmds. fault mutex prevents
- * races with other faulting threads.
+ * Serialize via hugetlb_fault_mutex
*/
- mapping = dst_vma->vm_file->f_mapping;
- i_mmap_lock_read(mapping);
idx = linear_page_index(dst_vma, dst_addr);
+ mapping = dst_vma->vm_file->f_mapping;
hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -283,7 +279,6 @@ retry:
dst_pte = huge_pte_alloc(dst_mm, dst_addr, huge_page_size(h));
if (!dst_pte) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -291,7 +286,6 @@ retry:
dst_pteval = huge_ptep_get(dst_pte);
if (!huge_pte_none(dst_pteval)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
goto out_unlock;
}
@@ -299,7 +293,6 @@ retry:
dst_addr, src_addr, &page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
- i_mmap_unlock_read(mapping);
vm_alloc_shared = vm_shared;
cond_resched();
@@ -550,7 +543,7 @@ retry:
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
diff --git a/mm/util.c b/mm/util.c
index 4df23d64aac7..1ea055138043 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -478,7 +478,7 @@ bool page_mapped(struct page *page)
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < hpage_nr_pages(page); i++) {
+ for (i = 0; i < (1 << compound_order(page)); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}