Diffstat (limited to 'mm/memory.c')
 mm/memory.c | 365 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 277 insertions(+), 88 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 469af373ae76..fcfc4ca36eba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -695,84 +695,218 @@ out:
  * covered by this vma.
  */
-static inline unsigned long
-copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+static unsigned long
+copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr, int *rss)
 {
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
+	swp_entry_t entry = pte_to_swp_entry(pte);
+
+	if (likely(!non_swap_entry(entry))) {
+		if (swap_duplicate(entry) < 0)
+			return entry.val;
+
+		/* make sure dst_mm is on swapoff's mmlist. */
+		if (unlikely(list_empty(&dst_mm->mmlist))) {
+			spin_lock(&mmlist_lock);
+			if (list_empty(&dst_mm->mmlist))
+				list_add(&dst_mm->mmlist,
+						&src_mm->mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		rss[MM_SWAPENTS]++;
+	} else if (is_migration_entry(entry)) {
+		page = migration_entry_to_page(entry);
 
-	/* pte contains position in swap or file, so copy. */
-	if (unlikely(!pte_present(pte))) {
-		swp_entry_t entry = pte_to_swp_entry(pte);
-
-		if (likely(!non_swap_entry(entry))) {
-			if (swap_duplicate(entry) < 0)
-				return entry.val;
-
-			/* make sure dst_mm is on swapoff's mmlist. */
-			if (unlikely(list_empty(&dst_mm->mmlist))) {
-				spin_lock(&mmlist_lock);
-				if (list_empty(&dst_mm->mmlist))
-					list_add(&dst_mm->mmlist,
-							&src_mm->mmlist);
-				spin_unlock(&mmlist_lock);
-			}
-			rss[MM_SWAPENTS]++;
-		} else if (is_migration_entry(entry)) {
-			page = migration_entry_to_page(entry);
-
-			rss[mm_counter(page)]++;
-
-			if (is_write_migration_entry(entry) &&
-					is_cow_mapping(vm_flags)) {
-				/*
-				 * COW mappings require pages in both
-				 * parent and child to be set to read.
-				 */
-				make_migration_entry_read(&entry);
-				pte = swp_entry_to_pte(entry);
-				if (pte_swp_soft_dirty(*src_pte))
-					pte = pte_swp_mksoft_dirty(pte);
-				if (pte_swp_uffd_wp(*src_pte))
-					pte = pte_swp_mkuffd_wp(pte);
-				set_pte_at(src_mm, addr, src_pte, pte);
-			}
-		} else if (is_device_private_entry(entry)) {
-			page = device_private_entry_to_page(entry);
+		rss[mm_counter(page)]++;
 
+		if (is_write_migration_entry(entry) &&
+				is_cow_mapping(vm_flags)) {
 			/*
-			 * Update rss count even for unaddressable pages, as
-			 * they should treated just like normal pages in this
-			 * respect.
-			 *
-			 * We will likely want to have some new rss counters
-			 * for unaddressable pages, at some point. But for now
-			 * keep things as they are.
+			 * COW mappings require pages in both
+			 * parent and child to be set to read.
 			 */
-			get_page(page);
-			rss[mm_counter(page)]++;
-			page_dup_rmap(page, false);
+			make_migration_entry_read(&entry);
+			pte = swp_entry_to_pte(entry);
+			if (pte_swp_soft_dirty(*src_pte))
+				pte = pte_swp_mksoft_dirty(pte);
+			if (pte_swp_uffd_wp(*src_pte))
+				pte = pte_swp_mkuffd_wp(pte);
+			set_pte_at(src_mm, addr, src_pte, pte);
+		}
+	} else if (is_device_private_entry(entry)) {
+		page = device_private_entry_to_page(entry);
 
-			/*
-			 * We do not preserve soft-dirty information, because so
-			 * far, checkpoint/restore is the only feature that
-			 * requires that. And checkpoint/restore does not work
-			 * when a device driver is involved (you cannot easily
-			 * save and restore device driver state).
-			 */
-			if (is_write_device_private_entry(entry) &&
-					is_cow_mapping(vm_flags)) {
-				make_device_private_entry_read(&entry);
-				pte = swp_entry_to_pte(entry);
-				if (pte_swp_uffd_wp(*src_pte))
-					pte = pte_swp_mkuffd_wp(pte);
-				set_pte_at(src_mm, addr, src_pte, pte);
-			}
+		/*
+		 * Update rss count even for unaddressable pages, as
+		 * they should treated just like normal pages in this
+		 * respect.
+		 *
+		 * We will likely want to have some new rss counters
+		 * for unaddressable pages, at some point. But for now
+		 * keep things as they are.
+		 */
+		get_page(page);
+		rss[mm_counter(page)]++;
+		page_dup_rmap(page, false);
+
+		/*
+		 * We do not preserve soft-dirty information, because so
+		 * far, checkpoint/restore is the only feature that
+		 * requires that. And checkpoint/restore does not work
+		 * when a device driver is involved (you cannot easily
+		 * save and restore device driver state).
+		 */
+		if (is_write_device_private_entry(entry) &&
+				is_cow_mapping(vm_flags)) {
+			make_device_private_entry_read(&entry);
+			pte = swp_entry_to_pte(entry);
+			if (pte_swp_uffd_wp(*src_pte))
+				pte = pte_swp_mkuffd_wp(pte);
+			set_pte_at(src_mm, addr, src_pte, pte);
 		}
-		goto out_set_pte;
+	}
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pte_t *dst_pte, pte_t *src_pte,
+		struct vm_area_struct *vma, struct vm_area_struct *new,
+		unsigned long addr, int *rss, struct page **prealloc,
+		pte_t pte, struct page *page)
+{
+	struct page *new_page;
+
+	if (!is_cow_mapping(vma->vm_flags))
+		return 1;
+
+	/*
+	 * The trick starts.
+	 *
+	 * What we want to do is to check whether this page may
+	 * have been pinned by the parent process.  If so,
+	 * instead of wrprotect the pte on both sides, we copy
+	 * the page immediately so that we'll always guarantee
+	 * the pinned page won't be randomly replaced in the
+	 * future.
+	 *
+	 * To achieve this, we do the following:
+	 *
+	 * 1. Write-protect the pte if it's writable. This is
+	 *    to protect concurrent write fast-gup with
+	 *    FOLL_PIN, so that we'll fail the fast-gup with
+	 *    the write bit removed.
+	 *
+	 * 2. Check page_maybe_dma_pinned() to see whether this
+	 *    page may have been pinned.
+	 *
+	 * The order of these steps is important to serialize
+	 * against the fast-gup code (gup_pte_range()) on the
+	 * pte check and try_grab_compound_head(), so that
+	 * we'll make sure either we'll capture that fast-gup
+	 * so we'll copy the pinned page here, or we'll fail
+	 * that fast-gup.
+	 *
+	 * NOTE! Even if we don't end up copying the page,
+	 * we won't undo this wrprotect(), because the normal
+	 * reference copy will need it anyway.
+	 */
+	if (pte_write(pte))
+		ptep_set_wrprotect(src_mm, addr, src_pte);
+
+	/*
+	 * These are the "normally we can just copy by reference"
+	 * checks.
+	 */
+	if (likely(!atomic_read(&src_mm->has_pinned)))
+		return 1;
+	if (likely(!page_maybe_dma_pinned(page)))
+		return 1;
+
+	/*
+	 * Uhhuh. It looks like the page might be a pinned page,
+	 * and we actually need to copy it. Now we can set the
+	 * source pte back to being writable.
+	 */
+	if (pte_write(pte))
+		set_pte_at(src_mm, addr, src_pte, pte);
+
+	new_page = *prealloc;
+	if (!new_page)
+		return -EAGAIN;
+
+	/*
+	 * We have a prealloc page, all good!  Take it
+	 * over and copy the page & arm it.
+	 */
+	*prealloc = NULL;
+	copy_user_highpage(new_page, page, addr, vma);
+	__SetPageUptodate(new_page);
+	page_add_new_anon_rmap(new_page, new, addr, false);
+	lru_cache_add_inactive_or_unevictable(new_page, new);
+	rss[mm_counter(new_page)]++;
+
+	/* All done, just insert the new page copy in the child */
+	pte = mk_pte(new_page, new->vm_page_prot);
+	pte = maybe_mkwrite(pte_mkdirty(pte), new);
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+/*
+ * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
+copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+		struct vm_area_struct *new,
+		unsigned long addr, int *rss, struct page **prealloc)
+{
+	unsigned long vm_flags = vma->vm_flags;
+	pte_t pte = *src_pte;
+	struct page *page;
+
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		int retval;
+
+		retval = copy_present_page(dst_mm, src_mm,
+					   dst_pte, src_pte,
+					   vma, new,
+					   addr, rss, prealloc,
+					   pte, page);
+		if (retval <= 0)
+			return retval;
+
+		get_page(page);
+		page_dup_rmap(page, false);
+		rss[mm_counter(page)]++;
 	}
 
 	/*
@@ -800,35 +934,51 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (!(vm_flags & VM_UFFD_WP))
 		pte = pte_clear_uffd_wp(pte);
 
-	page = vm_normal_page(vma, addr, pte);
-	if (page) {
-		get_page(page);
-		page_dup_rmap(page, false);
-		rss[mm_counter(page)]++;
-	}
-
-out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
 	return 0;
 }
 
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+		   unsigned long addr)
+{
+	struct page *new_page;
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+	if (!new_page)
+		return NULL;
+
+	if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+		put_page(new_page);
+		return NULL;
+	}
+	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+
+	return new_page;
+}
+
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		   pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+		   struct vm_area_struct *new,
 		   unsigned long addr, unsigned long end)
 {
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress = 0;
+	int progress, ret = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
+	struct page *prealloc = NULL;
 
 again:
+	progress = 0;
 	init_rss_vec(rss);
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
-	if (!dst_pte)
-		return -ENOMEM;
+	if (!dst_pte) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	src_pte = pte_offset_map(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -851,10 +1001,34 @@ again:
 			progress++;
 			continue;
 		}
-		entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+		if (unlikely(!pte_present(*src_pte))) {
+			entry.val = copy_nonpresent_pte(dst_mm, src_mm,
+							dst_pte, src_pte,
 							vma, addr, rss);
-		if (entry.val)
+			if (entry.val)
+				break;
+			progress += 8;
+			continue;
+		}
+		/* copy_present_pte() will clear `*prealloc' if consumed */
+		ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+				       vma, new, addr, rss, &prealloc);
+		/*
+		 * If we need a pre-allocated page for this pte, drop the
+		 * locks, allocate, and try again.
+		 */
+		if (unlikely(ret == -EAGAIN))
 			break;
+		if (unlikely(prealloc)) {
+			/*
+			 * pre-alloc page cannot be reused by next time so as
+			 * to strictly follow mempolicy (e.g., alloc_page_vma()
+			 * will allocate page according to address).  This
+			 * could only happen if one pinned pte changed.
+			 */
+			put_page(prealloc);
+			prealloc = NULL;
+		}
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
@@ -866,17 +1040,30 @@ again:
 	cond_resched();
 
 	if (entry.val) {
-		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		entry.val = 0;
+	} else if (ret) {
+		WARN_ON_ONCE(ret != -EAGAIN);
+		prealloc = page_copy_prealloc(src_mm, vma, addr);
+		if (!prealloc)
 			return -ENOMEM;
-		progress = 0;
+		/* We've captured and resolved the error. Reset, try again. */
+		ret = 0;
 	}
 	if (addr != end)
 		goto again;
-	return 0;
+out:
+	if (unlikely(prealloc))
+		put_page(prealloc);
+	return ret;
 }
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+		struct vm_area_struct *new,
 		unsigned long addr, unsigned long end)
 {
 	pmd_t *src_pmd, *dst_pmd;
@@ -903,7 +1090,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
 		if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
-				   vma, addr, next))
+				   vma, new, addr, next))
 			return -ENOMEM;
 	} while (dst_pmd++, src_pmd++, addr = next, addr != end);
 	return 0;
@@ -911,6 +1098,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
 
 static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
+		struct vm_area_struct *new,
 		unsigned long addr, unsigned long end)
 {
 	pud_t *src_pud, *dst_pud;
@@ -937,7 +1125,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
 		if (pud_none_or_clear_bad(src_pud))
 			continue;
 		if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
-				   vma, addr, next))
+				   vma, new, addr, next))
 			return -ENOMEM;
 	} while (dst_pud++, src_pud++, addr = next, addr != end);
 	return 0;
@@ -945,6 +1133,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
 
 static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+		struct vm_area_struct *new,
 		unsigned long addr, unsigned long end)
 {
 	p4d_t *src_p4d, *dst_p4d;
@@ -959,14 +1148,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src
 		if (p4d_none_or_clear_bad(src_p4d))
 			continue;
 		if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
-				   vma, addr, next))
+				   vma, new, addr, next))
 			return -ENOMEM;
 	} while (dst_p4d++, src_p4d++, addr = next, addr != end);
 	return 0;
 }
 
 int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		struct vm_area_struct *vma)
+		struct vm_area_struct *vma, struct vm_area_struct *new)
 {
 	pgd_t *src_pgd, *dst_pgd;
 	unsigned long next;
@@ -1021,7 +1210,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
 		if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
-					    vma, addr, next))) {
+					    vma, new, addr, next))) {
 			ret = -ENOMEM;
 			break;
 		}
@@ -2955,8 +3144,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 		 * page count reference, and the page is locked,
 		 * it's dark out, and we're wearing sunglasses. Hit it.
 		 */
-		wp_page_reuse(vmf);
 		unlock_page(page);
+		wp_page_reuse(vmf);
 		return VM_FAULT_WRITE;
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
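
The core of the change is the ordering inside copy_present_page(): the parent pte is write-protected *before* page_maybe_dma_pinned() is consulted, so a racing FOLL_PIN fast-gup either fails on the cleared write bit or has already raised the pin count and is seen by the check. Below is a minimal standalone sketch of that decision flow; the types and names (copy_decision() and friends) are simplified stand-ins for illustration, not the kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's types; none of this is the real API. */
struct pte  { bool writable; };
struct page { int pincount; };
struct mm   { bool has_pinned; };

/*
 * Mirrors the decision order in copy_present_page():
 *   1 -> share by reference (pte stays write-protected for ordinary COW),
 *   0 -> copied into *prealloc,
 *  -1 -> caller must preallocate a page outside the lock and retry.
 */
static int copy_decision(struct mm *src_mm, struct pte *src_pte,
			 struct page *page, struct page **prealloc)
{
	bool was_writable = src_pte->writable;

	/*
	 * Step 1: write-protect BEFORE the pin check.  A racing fast-gup
	 * with FOLL_PIN either sees the cleared write bit and fails, or
	 * has already raised the pin count and is caught by step 2.
	 */
	src_pte->writable = false;

	/* Step 2: the "normally we can just copy by reference" checks. */
	if (!src_mm->has_pinned || page->pincount == 0)
		return 1;

	/* Maybe pinned: the parent keeps its page writable ... */
	src_pte->writable = was_writable;

	/* ... and the child needs its own copy, allocated by the caller. */
	if (!*prealloc)
		return -1;
	(*prealloc)->pincount = 0;	/* fresh, unpinned copy for the child */
	*prealloc = NULL;		/* consumed */
	return 0;
}

int main(void)
{
	struct mm mm = { .has_pinned = true };
	struct pte pte = { .writable = true };
	struct page page = { .pincount = 1 };	/* looks DMA-pinned */
	struct page spare = { .pincount = 0 };
	struct page *prealloc = &spare;

	printf("result: %d (0 means copied early)\n",
	       copy_decision(&mm, &pte, &page, &prealloc));
	return 0;
}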
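The other half of the patch is the retry protocol in copy_pte_range(): a sleeping allocation is not allowed while the two page-table spinlocks are held, so copy_present_page() reports -EAGAIN instead of allocating on the spot, and the loop breaks out, drops the locks, preallocates via page_copy_prealloc(), and jumps back to "again" at the same address. The toy program below exercises the same drop-allocate-retry shape; all names and the "lock" are illustrative, not the kernel code:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define NPTES 8

/* Entries marked 1 stand in for ptes whose page is pinned and must be copied. */
static const int pinned[NPTES] = { 0, 0, 1, 0, 0, 1, 0, 0 };

/* Copy one entry; consumes *prealloc when a private copy is needed. */
static int copy_one(int idx, void **prealloc)
{
	if (!pinned[idx])
		return 0;		/* shared by reference, nothing to do */
	if (!*prealloc)
		return -EAGAIN;		/* must allocate outside the "lock" */
	printf("pte %d: copied into preallocated page %p\n", idx, *prealloc);
	free(*prealloc);		/* consumed (the child would keep it) */
	*prealloc = NULL;
	return 0;
}

int main(void)
{
	void *prealloc = NULL;
	int idx = 0, ret = 0;

again:
	/* "Locked" section: no sleeping allocations allowed in here. */
	for (ret = 0; idx < NPTES; idx++) {
		ret = copy_one(idx, &prealloc);
		if (ret == -EAGAIN)
			break;		/* drop the locks and go allocate */
	}
	/* "Unlocked" again: safe to perform a sleeping allocation. */
	if (ret == -EAGAIN) {
		prealloc = malloc(4096);
		if (!prealloc)
			return 1;	/* the kernel returns -ENOMEM here */
		goto again;		/* resume at the same pte */
	}
	if (prealloc)
		free(prealloc);		/* leftover from a pte that changed */
	return 0;
}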