summaryrefslogtreecommitdiffstats
path: root/mm/khugepaged.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r--mm/khugepaged.c437
1 files changed, 298 insertions, 139 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 0ec69b96b497..6b9d39d65b73 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -55,6 +55,9 @@ enum scan_result {
SCAN_CGROUP_CHARGE_FAIL,
SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE,
+ SCAN_STORE_FAILED,
+ SCAN_COPY_MC,
+ SCAN_PAGE_FILLED,
};
#define CREATE_TRACE_POINTS
@@ -685,20 +688,21 @@ out:
return result;
}
-static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
- struct vm_area_struct *vma,
- unsigned long address,
- spinlock_t *ptl,
- struct list_head *compound_pagelist)
+static void __collapse_huge_page_copy_succeeded(pte_t *pte,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist)
{
- struct page *src_page, *tmp;
+ struct page *src_page;
+ struct page *tmp;
pte_t *_pte;
- for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
- _pte++, page++, address += PAGE_SIZE) {
- pte_t pteval = *_pte;
+ pte_t pteval;
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
+ _pte++, address += PAGE_SIZE) {
+ pteval = *_pte;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
- clear_user_highpage(page, address);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
if (is_zero_pfn(pte_pfn(pteval))) {
/*
@@ -710,7 +714,6 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
}
} else {
src_page = pte_page(pteval);
- copy_user_highpage(page, src_page, address, vma);
if (!PageCompound(src_page))
release_pte_page(src_page);
/*
@@ -737,6 +740,87 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
}
}
+static void __collapse_huge_page_copy_failed(pte_t *pte,
+ pmd_t *pmd,
+ pmd_t orig_pmd,
+ struct vm_area_struct *vma,
+ struct list_head *compound_pagelist)
+{
+ spinlock_t *pmd_ptl;
+
+ /*
+ * Re-establish the PMD to point to the original page table
+ * entry. Restoring PMD needs to be done prior to releasing
+ * pages. Since pages are still isolated and locked here,
+ * acquiring anon_vma_lock_write is unnecessary.
+ */
+ pmd_ptl = pmd_lock(vma->vm_mm, pmd);
+ pmd_populate(vma->vm_mm, pmd, pmd_pgtable(orig_pmd));
+ spin_unlock(pmd_ptl);
+ /*
+ * Release both raw and compound pages isolated
+ * in __collapse_huge_page_isolate.
+ */
+ release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
+}
+
+/*
+ * __collapse_huge_page_copy - attempts to copy memory contents from raw
+ * pages to a hugepage. Cleans up the raw pages if copying succeeds;
+ * otherwise restores the original page table and releases isolated raw pages.
+ * Returns SCAN_SUCCEED if copying succeeds, otherwise returns SCAN_COPY_MC.
+ *
+ * @pte: starting of the PTEs to copy from
+ * @page: the new hugepage to copy contents to
+ * @pmd: pointer to the new hugepage's PMD
+ * @orig_pmd: the original raw pages' PMD
+ * @vma: the original raw pages' virtual memory area
+ * @address: starting address to copy
+ * @ptl: lock on raw pages' PTEs
+ * @compound_pagelist: list that stores compound pages
+ */
+static int __collapse_huge_page_copy(pte_t *pte,
+ struct page *page,
+ pmd_t *pmd,
+ pmd_t orig_pmd,
+ struct vm_area_struct *vma,
+ unsigned long address,
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist)
+{
+ struct page *src_page;
+ pte_t *_pte;
+ pte_t pteval;
+ unsigned long _address;
+ int result = SCAN_SUCCEED;
+
+ /*
+ * Copying pages' contents is subject to memory poison at any iteration.
+ */
+ for (_pte = pte, _address = address; _pte < pte + HPAGE_PMD_NR;
+ _pte++, page++, _address += PAGE_SIZE) {
+ pteval = *_pte;
+ if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
+ clear_user_highpage(page, _address);
+ continue;
+ }
+ src_page = pte_page(pteval);
+ if (copy_mc_user_highpage(page, src_page, _address, vma) > 0) {
+ result = SCAN_COPY_MC;
+ break;
+ }
+ }
+
+ if (likely(result == SCAN_SUCCEED))
+ __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
+ compound_pagelist);
+ else
+ __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
+ compound_pagelist);
+
+ return result;
+}
+
static void khugepaged_alloc_sleep(void)
{
DEFINE_WAIT(wait);
@@ -976,12 +1060,19 @@ static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
GFP_TRANSHUGE);
int node = hpage_collapse_find_target_node(cc);
+ struct folio *folio;
if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
return SCAN_ALLOC_HUGE_PAGE_FAIL;
- if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
+
+ folio = page_folio(*hpage);
+ if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
+ folio_put(folio);
+ *hpage = NULL;
return SCAN_CGROUP_CHARGE_FAIL;
+ }
count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
+
return SCAN_SUCCEED;
}
@@ -1053,6 +1144,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
if (result != SCAN_SUCCEED)
goto out_up_write;
+ vma_start_write(vma);
anon_vma_lock_write(vma->anon_vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address,
@@ -1102,9 +1194,13 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
- &compound_pagelist);
+ result = __collapse_huge_page_copy(pte, hpage, pmd, _pmd,
+ vma, address, pte_ptl,
+ &compound_pagelist);
pte_unmap(pte);
+ if (unlikely(result != SCAN_SUCCEED))
+ goto out_up_write;
+
/*
* spin_lock() below is not the equivalent of smp_wmb(), but
* the smp_wmb() inside __SetPageUptodate() can be reused to
@@ -1132,10 +1228,8 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
out_up_write:
mmap_write_unlock(mm);
out_nolock:
- if (hpage) {
- mem_cgroup_uncharge(page_folio(hpage));
+ if (hpage)
put_page(hpage);
- }
trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
return result;
}
@@ -1176,7 +1270,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm,
* enabled swap entries. Please see
* comment below for pte_uffd_wp().
*/
- if (pte_swp_uffd_wp(pteval)) {
+ if (pte_swp_uffd_wp_any(pteval)) {
result = SCAN_PTE_UFFD_WP;
goto out_unmap;
}
@@ -1516,6 +1610,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
goto drop_hpage;
}
+ /* Lock the vma before taking i_mmap and page table locks */
+ vma_start_write(vma);
+
/*
* We need to lock the mapping so that from here on, only GUP-fast and
* hardware page walks can access the parts of the page tables that
@@ -1693,6 +1790,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
result = SCAN_PTE_MAPPED_HUGEPAGE;
if ((cc->is_khugepaged || is_target) &&
mmap_write_trylock(mm)) {
+ /* trylock for the same lock inversion as above */
+ if (!vma_try_start_write(vma))
+ goto unlock_next;
+
/*
* Re-check whether we have an ->anon_vma, because
* collapse_and_free_pmd() requires that either no
@@ -1758,17 +1859,18 @@ next:
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
- * - scan page cache replacing old pages with the new one
+ * - scan page cache, locking old pages
* + swap/gup in pages if necessary;
- * + fill in gaps;
- * + keep old pages around in case rollback is required;
+ * - copy data to new page
+ * - handle shmem holes
+ * + re-validate that holes weren't filled by someone else
+ * + check for userfaultfd
+ * - finalize updates to the page cache;
* - if replacing succeeds:
- * + copy data over;
- * + free old pages;
* + unlock huge page;
+ * + free old pages;
* - if replacing failed;
- * + put all pages back and unfreeze them;
- * + restore gaps in the page cache;
+ * + unlock old pages
* + unlock and free huge page;
*/
static int collapse_file(struct mm_struct *mm, unsigned long addr,
@@ -1777,6 +1879,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
{
struct address_space *mapping = file->f_mapping;
struct page *hpage;
+ struct page *page;
+ struct page *tmp;
+ struct folio *folio;
pgoff_t index = 0, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1791,6 +1896,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
if (result != SCAN_SUCCEED)
goto out;
+ __SetPageLocked(hpage);
+ if (is_shmem)
+ __SetPageSwapBacked(hpage);
+ hpage->index = start;
+ hpage->mapping = mapping;
+
/*
* Ensure we have slots for all the pages in the range. This is
* almost certainly a no-op because most of the pages must be present
@@ -1803,26 +1914,13 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
xas_unlock_irq(&xas);
if (!xas_nomem(&xas, GFP_KERNEL)) {
result = SCAN_FAIL;
- goto out;
+ goto rollback;
}
} while (1);
- __SetPageLocked(hpage);
- if (is_shmem)
- __SetPageSwapBacked(hpage);
- hpage->index = start;
- hpage->mapping = mapping;
-
- /*
- * At this point the hpage is locked and not up-to-date.
- * It's safe to insert it into the page cache, because nobody would
- * be able to map it or use it in another way until we unlock it.
- */
-
xas_set(&xas, start);
for (index = start; index < end; index++) {
- struct page *page = xas_next(&xas);
- struct folio *folio;
+ page = xas_next(&xas);
VM_BUG_ON(index != xas.xa_index);
if (is_shmem) {
@@ -1837,13 +1935,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
result = SCAN_TRUNCATED;
goto xa_locked;
}
- xas_set(&xas, index);
+ xas_set(&xas, index + 1);
}
if (!shmem_charge(mapping->host, 1)) {
result = SCAN_FAIL;
goto xa_locked;
}
- xas_store(&xas, hpage);
nr_none++;
continue;
}
@@ -1856,6 +1953,8 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
result = SCAN_FAIL;
goto xa_unlocked;
}
+ /* drain pagevecs to help isolate_lru_page() */
+ lru_add_drain();
page = folio_file_page(folio, index);
} else if (trylock_page(page)) {
get_page(page);
@@ -1976,12 +2075,16 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
VM_BUG_ON_PAGE(page != xas_load(&xas), page);
/*
- * The page is expected to have page_count() == 3:
+ * We control three references to the page:
* - we hold a pin on it;
* - one reference from page cache;
* - one from isolate_lru_page;
+ * If those are the only references, then any new usage of the
+ * page will have to fetch it from the page cache. That requires
+ * locking the page to handle truncate, so any new usage will be
+ * blocked until we unlock page after collapse/during rollback.
*/
- if (!page_ref_freeze(page, 3)) {
+ if (page_count(page) != 3) {
result = SCAN_PAGE_COUNT;
xas_unlock_irq(&xas);
putback_lru_page(page);
@@ -1989,25 +2092,17 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
/*
- * Add the page to the list to be able to undo the collapse if
- * something go wrong.
+ * Accumulate the pages that are being collapsed.
*/
list_add_tail(&page->lru, &pagelist);
-
- /* Finally, replace with the new page. */
- xas_store(&xas, hpage);
continue;
out_unlock:
unlock_page(page);
put_page(page);
goto xa_unlocked;
}
- nr = thp_nr_pages(hpage);
- if (is_shmem)
- __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
- else {
- __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
+ if (!is_shmem) {
filemap_nr_thps_inc(mapping);
/*
* Paired with smp_mb() in do_dentry_open() to ensure
@@ -2018,21 +2113,10 @@ out_unlock:
smp_mb();
if (inode_is_open_for_write(mapping->host)) {
result = SCAN_FAIL;
- __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
filemap_nr_thps_dec(mapping);
- goto xa_locked;
}
}
- if (nr_none) {
- __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
- /* nr_none is always 0 for non-shmem. */
- __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
- }
-
- /* Join all the small entries into a single multi-index entry */
- xas_set_order(&xas, start, HPAGE_PMD_ORDER);
- xas_store(&xas, hpage);
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
@@ -2044,101 +2128,174 @@ xa_unlocked:
*/
try_to_unmap_flush();
- if (result == SCAN_SUCCEED) {
- struct page *page, *tmp;
- struct folio *folio;
+ if (result != SCAN_SUCCEED)
+ goto rollback;
- /*
- * Replacing old pages with new one has succeeded, now we
- * need to copy the content and free the old pages.
- */
- index = start;
- list_for_each_entry_safe(page, tmp, &pagelist, lru) {
- while (index < page->index) {
- clear_highpage(hpage + (index % HPAGE_PMD_NR));
- index++;
- }
- copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
- page);
- list_del(&page->lru);
- page->mapping = NULL;
- page_ref_unfreeze(page, 1);
- ClearPageActive(page);
- ClearPageUnevictable(page);
- unlock_page(page);
- put_page(page);
- index++;
- }
- while (index < end) {
+ /*
+ * The old pages are locked, so they won't change anymore.
+ */
+ index = start;
+ list_for_each_entry(page, &pagelist, lru) {
+ while (index < page->index) {
clear_highpage(hpage + (index % HPAGE_PMD_NR));
index++;
}
+ if (copy_mc_highpage(hpage + (page->index % HPAGE_PMD_NR), page) > 0) {
+ result = SCAN_COPY_MC;
+ goto rollback;
+ }
+ index++;
+ }
+ while (index < end) {
+ clear_highpage(hpage + (index % HPAGE_PMD_NR));
+ index++;
+ }
- folio = page_folio(hpage);
- folio_mark_uptodate(folio);
- folio_ref_add(folio, HPAGE_PMD_NR - 1);
+ if (nr_none) {
+ struct vm_area_struct *vma;
+ int nr_none_check = 0;
- if (is_shmem)
- folio_mark_dirty(folio);
- folio_add_lru(folio);
+ i_mmap_lock_read(mapping);
+ xas_lock_irq(&xas);
- /*
- * Remove pte page tables, so we can re-fault the page as huge.
- */
- result = retract_page_tables(mapping, start, mm, addr, hpage,
- cc);
- unlock_page(hpage);
- hpage = NULL;
- } else {
- struct page *page;
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ if (!xas_next(&xas)) {
+ xas_store(&xas, XA_RETRY_ENTRY);
+ if (xas_error(&xas)) {
+ result = SCAN_STORE_FAILED;
+ goto immap_locked;
+ }
+ nr_none_check++;
+ }
+ }
- /* Something went wrong: roll back page cache changes */
- xas_lock_irq(&xas);
- if (nr_none) {
- mapping->nrpages -= nr_none;
- shmem_uncharge(mapping->host, nr_none);
+ if (nr_none != nr_none_check) {
+ result = SCAN_PAGE_FILLED;
+ goto immap_locked;
}
- xas_set(&xas, start);
- xas_for_each(&xas, page, end - 1) {
- page = list_first_entry_or_null(&pagelist,
- struct page, lru);
- if (!page || xas.xa_index < page->index) {
- if (!nr_none)
- break;
- nr_none--;
- /* Put holes back where they were */
- xas_store(&xas, NULL);
- continue;
+ /*
+ * If userspace observed a missing page in a VMA with a MODE_MISSING
+ * userfaultfd, then it might expect a UFFD_EVENT_PAGEFAULT for that
+ * page. If so, we need to roll back to avoid suppressing such an
+ * event. Since wp/minor userfaultfds don't give userspace any
+ * guarantees that the kernel doesn't fill a missing page with a zero
+ * page, so they don't matter here.
+ *
+ * Any userfaultfds registered after this point will not be able to
+ * observe any missing pages due to the previously inserted retry
+ * entries.
+ */
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, start, end) {
+ if (userfaultfd_missing(vma)) {
+ result = SCAN_EXCEED_NONE_PTE;
+ goto immap_locked;
}
+ }
- VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
+immap_locked:
+ i_mmap_unlock_read(mapping);
+ if (result != SCAN_SUCCEED) {
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ if (xas_next(&xas) == XA_RETRY_ENTRY)
+ xas_store(&xas, NULL);
+ }
- /* Unfreeze the page. */
- list_del(&page->lru);
- page_ref_unfreeze(page, 2);
- xas_store(&xas, page);
- xas_pause(&xas);
xas_unlock_irq(&xas);
- unlock_page(page);
- putback_lru_page(page);
- xas_lock_irq(&xas);
+ goto rollback;
}
- VM_BUG_ON(nr_none);
+ } else {
+ xas_lock_irq(&xas);
+ }
+
+ nr = thp_nr_pages(hpage);
+ if (is_shmem)
+ __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
+ else
+ __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
+
+ if (nr_none) {
+ __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
+ /* nr_none is always 0 for non-shmem. */
+ __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
+ }
+
+ /*
+ * Mark hpage as uptodate before inserting it into the page cache so
+ * that it isn't mistaken for an fallocated but unwritten page.
+ */
+ folio = page_folio(hpage);
+ folio_mark_uptodate(folio);
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+
+ if (is_shmem)
+ folio_mark_dirty(folio);
+ folio_add_lru(folio);
+
+ /* Join all the small entries into a single multi-index entry. */
+ xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+ xas_store(&xas, hpage);
+ WARN_ON_ONCE(xas_error(&xas));
+ xas_unlock_irq(&xas);
+
+ /*
+ * Remove pte page tables, so we can re-fault the page as huge.
+ */
+ result = retract_page_tables(mapping, start, mm, addr, hpage,
+ cc);
+ unlock_page(hpage);
+
+ /*
+ * The collapse has succeeded, so free the old pages.
+ */
+ list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+ list_del(&page->lru);
+ page->mapping = NULL;
+ ClearPageActive(page);
+ ClearPageUnevictable(page);
+ unlock_page(page);
+ folio_put_refs(page_folio(page), 3);
+ }
+
+ goto out;
+
+rollback:
+ /* Something went wrong: roll back page cache changes */
+ if (nr_none) {
+ xas_lock_irq(&xas);
+ mapping->nrpages -= nr_none;
+ shmem_uncharge(mapping->host, nr_none);
xas_unlock_irq(&xas);
+ }
- hpage->mapping = NULL;
+ list_for_each_entry_safe(page, tmp, &pagelist, lru) {
+ list_del(&page->lru);
+ unlock_page(page);
+ putback_lru_page(page);
+ put_page(page);
+ }
+ /*
+ * Undo the updates of filemap_nr_thps_inc for non-SHMEM
+ * file only. This undo is not needed unless failure is
+ * due to SCAN_COPY_MC.
+ */
+ if (!is_shmem && result == SCAN_COPY_MC) {
+ filemap_nr_thps_dec(mapping);
+ /*
+ * Paired with smp_mb() in do_dentry_open() to
+ * ensure the update to nr_thps is visible.
+ */
+ smp_mb();
}
- if (hpage)
- unlock_page(hpage);
+ hpage->mapping = NULL;
+
+ unlock_page(hpage);
+ put_page(hpage);
out:
VM_BUG_ON(!list_empty(&pagelist));
- if (hpage) {
- mem_cgroup_uncharge(page_folio(hpage));
- put_page(hpage);
- }
-
trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result);
return result;
}
@@ -2624,12 +2781,14 @@ static int madvise_collapse_errno(enum scan_result r)
case SCAN_ALLOC_HUGE_PAGE_FAIL:
return -ENOMEM;
case SCAN_CGROUP_CHARGE_FAIL:
+ case SCAN_EXCEED_NONE_PTE:
return -EBUSY;
/* Resource temporary unavailable - trying again might succeed */
case SCAN_PAGE_COUNT:
case SCAN_PAGE_LOCK:
case SCAN_PAGE_LRU:
case SCAN_DEL_PAGE_LRU:
+ case SCAN_PAGE_FILLED:
return -EAGAIN;
/*
* Other: Trying again likely not to succeed / error intrinsic to