diff options
Diffstat (limited to 'mm')
51 files changed, 1702 insertions, 510 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index ab80933be65f..c1acc34c1c35 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -139,6 +139,10 @@ config HAVE_FAST_GUP config ARCH_KEEP_MEMBLOCK bool +# Keep arch NUMA mapping infrastructure post-init. +config NUMA_KEEP_MEMINFO + bool + config MEMORY_ISOLATION bool @@ -154,6 +158,7 @@ config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG + select NUMA_KEEP_MEMINFO if NUMA config MEMORY_HOTPLUG_SPARSE def_bool y @@ -237,6 +242,17 @@ config COMPACTION linux-mm@kvack.org. # +# support for free page reporting +config PAGE_REPORTING + bool "Free page reporting" + def_bool n + help + Free page reporting allows for the incremental acquisition of + free pages from the buddy allocator for the purpose of reporting + those pages to another entity, such as a hypervisor, so that the + memory can be freed within the host for other uses. + +# # support for page migration # config MIGRATION @@ -420,10 +436,6 @@ config THP_SWAP For selection by architectures with reasonable THP sizes. -config TRANSPARENT_HUGE_PAGECACHE - def_bool y - depends on TRANSPARENT_HUGEPAGE - # # UP and nommu archs use km based percpu allocator # @@ -526,7 +538,6 @@ config MEM_SOFT_DIRTY config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" depends on FRONTSWAP && CRYPTO=y - select CRYPTO_LZO select ZPOOL help A lightweight compressed cache for swap pages. It takes @@ -542,6 +553,123 @@ config ZSWAP they have not be fully explored on the large set of potential configurations and workloads that exist. +choice + prompt "Compressed cache for swap pages default compressor" + depends on ZSWAP + default ZSWAP_COMPRESSOR_DEFAULT_LZO + help + Selects the default compression algorithm for the compressed cache + for swap pages. + + For an overview what kind of performance can be expected from + a particular compression algorithm please refer to the benchmarks + available at the following LWN page: + https://lwn.net/Articles/751795/ + + If in doubt, select 'LZO'. + + The selection made here can be overridden by using the kernel + command line 'zswap.compressor=' option. + +config ZSWAP_COMPRESSOR_DEFAULT_DEFLATE + bool "Deflate" + select CRYPTO_DEFLATE + help + Use the Deflate algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZO + bool "LZO" + select CRYPTO_LZO + help + Use the LZO algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_842 + bool "842" + select CRYPTO_842 + help + Use the 842 algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZ4 + bool "LZ4" + select CRYPTO_LZ4 + help + Use the LZ4 algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_LZ4HC + bool "LZ4HC" + select CRYPTO_LZ4HC + help + Use the LZ4HC algorithm as the default compression algorithm. + +config ZSWAP_COMPRESSOR_DEFAULT_ZSTD + bool "zstd" + select CRYPTO_ZSTD + help + Use the zstd algorithm as the default compression algorithm. +endchoice + +config ZSWAP_COMPRESSOR_DEFAULT + string + depends on ZSWAP + default "deflate" if ZSWAP_COMPRESSOR_DEFAULT_DEFLATE + default "lzo" if ZSWAP_COMPRESSOR_DEFAULT_LZO + default "842" if ZSWAP_COMPRESSOR_DEFAULT_842 + default "lz4" if ZSWAP_COMPRESSOR_DEFAULT_LZ4 + default "lz4hc" if ZSWAP_COMPRESSOR_DEFAULT_LZ4HC + default "zstd" if ZSWAP_COMPRESSOR_DEFAULT_ZSTD + default "" + +choice + prompt "Compressed cache for swap pages default allocator" + depends on ZSWAP + default ZSWAP_ZPOOL_DEFAULT_ZBUD + help + Selects the default allocator for the compressed cache for + swap pages. + The default is 'zbud' for compatibility, however please do + read the description of each of the allocators below before + making a right choice. + + The selection made here can be overridden by using the kernel + command line 'zswap.zpool=' option. + +config ZSWAP_ZPOOL_DEFAULT_ZBUD + bool "zbud" + select ZBUD + help + Use the zbud allocator as the default allocator. + +config ZSWAP_ZPOOL_DEFAULT_Z3FOLD + bool "z3fold" + select Z3FOLD + help + Use the z3fold allocator as the default allocator. + +config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + bool "zsmalloc" + select ZSMALLOC + help + Use the zsmalloc allocator as the default allocator. +endchoice + +config ZSWAP_ZPOOL_DEFAULT + string + depends on ZSWAP + default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD + default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD + default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC + default "" + +config ZSWAP_DEFAULT_ON + bool "Enable the compressed cache for swap pages by default" + depends on ZSWAP + help + If selected, the compressed cache for swap pages will be enabled + at boot, otherwise it will be disabled. + + The selection made here can be overridden by using the kernel + command line 'zswap.enabled=' option. + config ZPOOL tristate "Common API for compressed memory storage" help @@ -714,7 +842,7 @@ config GUP_GET_PTE_LOW_HIGH config READ_ONLY_THP_FOR_FS bool "Read-only THP for filesystems (EXPERIMENTAL)" - depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM + depends on TRANSPARENT_HUGEPAGE && SHMEM help Allow khugepaged to put read-only file-backed pages in THP. diff --git a/mm/Makefile b/mm/Makefile index dbc8346d16ca..fccd3756b25f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -111,3 +111,4 @@ obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o +obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 62f05f605fb5..c81b4f3a7268 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -491,8 +491,8 @@ static void cgwb_release_workfn(struct work_struct *work) css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); - /* triggers blkg destruction if cgwb_refcnt becomes zero */ - blkcg_cgwb_put(blkcg); + /* triggers blkg destruction if no online users left */ + blkcg_unpin_online(blkcg); fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); @@ -592,7 +592,7 @@ static int cgwb_create(struct backing_dev_info *bdi, list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); - blkcg_cgwb_get(blkcg); + blkcg_pin_online(blkcg); css_get(memcg_css); css_get(blkcg_css); } @@ -220,7 +220,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, } /** - * cma_declare_contiguous() - reserve custom contiguous area + * cma_declare_contiguous_nid() - reserve custom contiguous area * @base: Base address of the reserved area optional, use 0 for any * @size: Size of the reserved area (in bytes), * @limit: End address of the reserved memory (optional, 0 for any). @@ -229,6 +229,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * @fixed: hint about where to place the reserved area * @name: The name of the area. See function cma_init_reserved_mem() * @res_cma: Pointer to store the created cma region. + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * * This function reserves memory from early allocator. It should be * called by arch specific code once the early allocator (memblock or bootmem) @@ -238,10 +239,11 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * If @fixed is true, reserve contiguous area at exactly @base. If false, * reserve in range from @base to @limit. */ -int __init cma_declare_contiguous(phys_addr_t base, +int __init cma_declare_contiguous_nid(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, const char *name, struct cma **res_cma) + bool fixed, const char *name, struct cma **res_cma, + int nid) { phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t highmem_start; @@ -336,14 +338,14 @@ int __init cma_declare_contiguous(phys_addr_t base, * memory in case of failure. */ if (base < highmem_start && limit > highmem_start) { - addr = memblock_phys_alloc_range(size, alignment, - highmem_start, limit); + addr = memblock_alloc_range_nid(size, alignment, + highmem_start, limit, nid, false); limit = highmem_start; } if (!addr) { - addr = memblock_phys_alloc_range(size, alignment, base, - limit); + addr = memblock_alloc_range_nid(size, alignment, base, + limit, nid, false); if (!addr) { ret = -ENOMEM; goto err; diff --git a/mm/compaction.c b/mm/compaction.c index df3da2f76fdc..46f0fcc93081 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -481,6 +481,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page, */ static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags, struct compact_control *cc) + __acquires(lock) { /* Track if the lock is contended in async mode */ if (cc->mode == MIGRATE_ASYNC && !cc->contended) { @@ -989,7 +990,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); mod_node_page_state(page_pgdat(page), - NR_ISOLATED_ANON + page_is_file_cache(page), + NR_ISOLATED_ANON + page_is_file_lru(page), hpage_nr_pages(page)); isolate_success: diff --git a/mm/dmapool.c b/mm/dmapool.c index fe5d33060415..f9fb9bbd733e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -144,9 +144,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if (size < 4) size = 4; - if ((size % align) != 0) - size = ALIGN(size, align); - + size = ALIGN(size, align); allocation = max_t(size_t, size, PAGE_SIZE); if (!boundary) diff --git a/mm/filemap.c b/mm/filemap.c index 0fbdc8e30dd2..23a051a7ef0f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1693,6 +1693,11 @@ EXPORT_SYMBOL(pagecache_get_page); * Any shadow entries of evicted pages, or swap entries from * shmem/tmpfs, are included in the returned array. * + * If it finds a Transparent Huge Page, head or tail, find_get_entries() + * stops at that page: the caller is likely to have a better way to handle + * the compound page as a whole, and then skip its extent, than repeatedly + * calling find_get_entries() to return all its tails. + * * Return: the number of pages and shadow entries which were found. */ unsigned find_get_entries(struct address_space *mapping, @@ -1724,8 +1729,15 @@ unsigned find_get_entries(struct address_space *mapping, /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) goto put_page; - page = find_subpage(page, xas.xa_index); + /* + * Terminate early on finding a THP, to allow the caller to + * handle it all at once; but continue if this is hugetlbfs. + */ + if (PageTransHuge(page) && !PageHuge(page)) { + page = find_subpage(page, xas.xa_index); + nr_entries = ret + 1; + } export: indices[ret] = xas.xa_index; entries[ret] = page; @@ -351,7 +351,8 @@ static struct page *no_page_table(struct vm_area_struct *vma, * But we can only make this optimization where a hole would surely * be zero-filled if handle_mm_fault() actually did handle it. */ - if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) + if ((flags & FOLL_DUMP) && + (vma_is_anonymous(vma) || !vma->vm_ops->fault)) return ERR_PTR(-EFAULT); return NULL; } @@ -1101,7 +1102,7 @@ retry: goto retry; case -EBUSY: ret = 0; - /* FALLTHRU */ + fallthrough; case -EFAULT: case -ENOMEM: case -EHWPOISON: @@ -1325,10 +1326,12 @@ retry: * start trying again otherwise it can loop forever. */ - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current)) { + if (!pages_done) + pages_done = -EINTR; break; + } - *locked = 1; ret = down_read_killable(&mm->mmap_sem); if (ret) { BUG_ON(ret > 0); @@ -1337,6 +1340,7 @@ retry: break; } + *locked = 1; ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED, pages, NULL, locked); if (!*locked) { @@ -1416,7 +1420,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, * We want mlock to succeed for regions that have any permissions * other than PROT_NONE. */ - if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + if (vma_is_accessible(vma)) gup_flags |= FOLL_FORCE; /* @@ -1676,7 +1680,7 @@ check_again: list_add_tail(&head->lru, &cma_page_list); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + - page_is_file_cache(head), + page_is_file_lru(head), hpage_nr_pages(head)); } } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b1e069e68189..6ecd1045113b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -326,7 +326,7 @@ static struct attribute *hugepage_attr[] = { &defrag_attr.attr, &use_zero_page_attr.attr, &hpage_pmd_size_attr.attr, -#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +#ifdef CONFIG_SHMEM &shmem_enabled_attr.attr, #endif #ifdef CONFIG_DEBUG_VM @@ -597,6 +597,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); return VM_FAULT_FALLBACK; } @@ -824,11 +825,24 @@ out_unlock: pte_free(mm, pgtable); } -vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) +/** + * vmf_insert_pfn_pmd_prot - insert a pmd size pfn + * @vmf: Structure describing the fault + * @pfn: pfn to insert + * @pgprot: page protection to use + * @write: whether it's a write fault + * + * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and + * also consult the vmf_insert_mixed_prot() documentation when + * @pgprot != @vmf->vma->vm_page_prot. + * + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, + pgprot_t pgprot, bool write) { unsigned long addr = vmf->address & PMD_MASK; struct vm_area_struct *vma = vmf->vma; - pgprot_t pgprot = vma->vm_page_prot; pgtable_t pgtable = NULL; /* @@ -856,7 +870,7 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable); return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma) @@ -902,11 +916,24 @@ out_unlock: spin_unlock(ptl); } -vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) +/** + * vmf_insert_pfn_pud_prot - insert a pud size pfn + * @vmf: Structure describing the fault + * @pfn: pfn to insert + * @pgprot: page protection to use + * @write: whether it's a write fault + * + * Insert a pud size pfn. See vmf_insert_pfn() for additional info and + * also consult the vmf_insert_mixed_prot() documentation when + * @pgprot != @vmf->vma->vm_page_prot. + * + * Return: vm_fault_t value. + */ +vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, + pgprot_t pgprot, bool write) { unsigned long addr = vmf->address & PUD_MASK; struct vm_area_struct *vma = vmf->vma; - pgprot_t pgprot = vma->vm_page_prot; /* * If we had pud_special, we could avoid all these restrictions, @@ -927,7 +954,7 @@ vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write); return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud); +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot); #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -1017,6 +1044,14 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, ret = -EAGAIN; pmd = *src_pmd; + /* + * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA + * does not have the VM_UFFD_WP, which means that the uffd + * fork event is not enabled. + */ + if (!(vma->vm_flags & VM_UFFD_WP)) + pmd = pmd_clear_uffd_wp(pmd); + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (unlikely(is_swap_pmd(pmd))) { swp_entry_t entry = pmd_to_swp_entry(pmd); @@ -1420,6 +1455,7 @@ alloc: put_page(page); ret |= VM_FAULT_FALLBACK; count_vm_event(THP_FAULT_FALLBACK); + count_vm_event(THP_FAULT_FALLBACK_CHARGE); goto out; } @@ -1819,7 +1855,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd, tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); - if (vma_is_dax(vma)) { + if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); @@ -1951,13 +1987,16 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, * - HPAGE_PMD_NR is protections changed and TLB flush necessary */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot, int prot_numa) + unsigned long addr, pgprot_t newprot, unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t entry; bool preserve_write; int ret; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) @@ -2024,6 +2063,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, entry = pmd_modify(entry, newprot); if (preserve_write) entry = pmd_mk_savedwrite(entry); + if (uffd_wp) { + entry = pmd_wrprotect(entry); + entry = pmd_mkuffd_wp(entry); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled by PF interrupt + * handler, then things like COW could be properly + * handled. + */ + entry = pmd_clear_uffd_wp(entry); + } ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); @@ -2083,7 +2133,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, */ pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); - if (vma_is_dax(vma)) { + if (vma_is_special_huge(vma)) { spin_unlock(ptl); /* No zero page support yet */ } else { @@ -2172,7 +2222,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct page *page; pgtable_t pgtable; pmd_t old_pmd, _pmd; - bool young, write, soft_dirty, pmd_migration = false; + bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; unsigned long addr; int i; @@ -2192,7 +2242,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, */ if (arch_needs_pgtable_deposit()) zap_deposited_table(mm, pmd); - if (vma_is_dax(vma)) + if (vma_is_special_huge(vma)) return; page = pmd_page(_pmd); if (!PageDirty(page) && pmd_dirty(_pmd)) @@ -2247,6 +2297,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = is_write_migration_entry(entry); young = false; soft_dirty = pmd_swp_soft_dirty(old_pmd); + uffd_wp = pmd_swp_uffd_wp(old_pmd); } else { page = pmd_page(old_pmd); if (pmd_dirty(old_pmd)) @@ -2254,6 +2305,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = pmd_write(old_pmd); young = pmd_young(old_pmd); soft_dirty = pmd_soft_dirty(old_pmd); + uffd_wp = pmd_uffd_wp(old_pmd); } VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); @@ -2278,6 +2330,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = swp_entry_to_pte(swp_entry); if (soft_dirty) entry = pte_swp_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_swp_mkuffd_wp(entry); } else { entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); entry = maybe_mkwrite(entry, vma); @@ -2287,6 +2341,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mkold(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); } pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f9ea1e5197b4..cd459155d28a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -28,6 +28,7 @@ #include <linux/jhash.h> #include <linux/numa.h> #include <linux/llist.h> +#include <linux/cma.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -44,6 +45,9 @@ int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; + +static struct cma *hugetlb_cma[MAX_NUMNODES]; + /* * Minimum page order among possible hugepage sizes, set to a proper value * at boot time. @@ -1228,6 +1232,14 @@ static void destroy_compound_gigantic_page(struct page *page, static void free_gigantic_page(struct page *page, unsigned int order) { + /* + * If the page isn't allocated using the cma allocator, + * cma_release() returns false. + */ + if (IS_ENABLED(CONFIG_CMA) && + cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) + return; + free_contig_range(page_to_pfn(page), 1 << order); } @@ -1237,6 +1249,21 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, { unsigned long nr_pages = 1UL << huge_page_order(h); + if (IS_ENABLED(CONFIG_CMA)) { + struct page *page; + int node; + + for_each_node_mask(node, *nodemask) { + if (!hugetlb_cma[node]) + continue; + + page = cma_alloc(hugetlb_cma[node], nr_pages, + huge_page_order(h), true); + if (page) + return page; + } + } + return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); } @@ -1281,8 +1308,14 @@ static void update_and_free_page(struct hstate *h, struct page *page) set_compound_page_dtor(page, NULL_COMPOUND_DTOR); set_page_refcounted(page); if (hstate_is_gigantic(h)) { + /* + * Temporarily drop the hugetlb_lock, because + * we might block in free_gigantic_page(). + */ + spin_unlock(&hugetlb_lock); destroy_compound_gigantic_page(page, huge_page_order(h)); free_gigantic_page(page, huge_page_order(h)); + spin_lock(&hugetlb_lock); } else { __free_pages(page, huge_page_order(h)); } @@ -2010,6 +2043,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, * of size 'delta'. */ static int gather_surplus_pages(struct hstate *h, int delta) + __must_hold(&hugetlb_lock) { struct list_head surplus_list; struct page *page, *tmp; @@ -2538,6 +2572,10 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { + if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) { + pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n"); + break; + } if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_pool_huge_page(h, @@ -3193,6 +3231,7 @@ static int __init hugetlb_init(void) default_hstate.max_huge_pages = default_hstate_max_huge_pages; } + hugetlb_cma_check(); hugetlb_init_hstates(); gather_bootmem_prealloc(); report_hugepages(); @@ -5505,3 +5544,74 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) spin_unlock(&hugetlb_lock); } } + +#ifdef CONFIG_CMA +static unsigned long hugetlb_cma_size __initdata; +static bool cma_reserve_called __initdata; + +static int __init cmdline_parse_hugetlb_cma(char *p) +{ + hugetlb_cma_size = memparse(p, &p); + return 0; +} + +early_param("hugetlb_cma", cmdline_parse_hugetlb_cma); + +void __init hugetlb_cma_reserve(int order) +{ + unsigned long size, reserved, per_node; + int nid; + + cma_reserve_called = true; + + if (!hugetlb_cma_size) + return; + + if (hugetlb_cma_size < (PAGE_SIZE << order)) { + pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n", + (PAGE_SIZE << order) / SZ_1M); + return; + } + + /* + * If 3 GB area is requested on a machine with 4 numa nodes, + * let's allocate 1 GB on first three nodes and ignore the last one. + */ + per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes); + pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n", + hugetlb_cma_size / SZ_1M, per_node / SZ_1M); + + reserved = 0; + for_each_node_state(nid, N_ONLINE) { + int res; + + size = min(per_node, hugetlb_cma_size - reserved); + size = round_up(size, PAGE_SIZE << order); + + res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order, + 0, false, "hugetlb", + &hugetlb_cma[nid], nid); + if (res) { + pr_warn("hugetlb_cma: reservation failed: err %d, node %d", + res, nid); + continue; + } + + reserved += size; + pr_info("hugetlb_cma: reserved %lu MiB on node %d\n", + size / SZ_1M, nid); + + if (reserved >= hugetlb_cma_size) + break; + } +} + +void __init hugetlb_cma_check(void) +{ + if (!hugetlb_cma_size || cma_reserve_called) + return; + + pr_warn("hugetlb_cma: the option isn't supported by current arch\n"); +} + +#endif /* CONFIG_CMA */ diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index c2d7ae6cabd1..aabf65d4d91b 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -467,14 +467,14 @@ static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: counter = &h_cg->rsvd_hugepage[idx]; - /* Fall through. */ + fallthrough; case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; case RES_RSVD_LIMIT: counter = &h_cg->rsvd_hugepage[idx]; - /* Fall through. */ + fallthrough; case RES_LIMIT: val = (u64)counter->max; if (val == limit) @@ -514,7 +514,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: rsvd = true; - /* Fall through. */ + fallthrough; case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); ret = page_counter_set_max( diff --git a/mm/internal.h b/mm/internal.h index 2d58ae15a958..b5634e78f01d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -180,6 +180,8 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn, } extern int __isolate_free_page(struct page *page, unsigned int order); +extern void __putback_isolated_page(struct page *page, unsigned int order, + int mt); extern void memblock_free_pages(struct page *page, unsigned long pfn, unsigned int order); extern void __free_pages_core(struct page *page, unsigned int order); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index e61b4a492218..2906358e42f0 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -15,7 +15,6 @@ */ #include <linux/export.h> -#include <linux/interrupt.h> #include <linux/init.h> #include <linux/kasan.h> #include <linux/kernel.h> @@ -42,28 +41,6 @@ #include "kasan.h" #include "../slab.h" -static inline int in_irqentry_text(unsigned long ptr) -{ - return (ptr >= (unsigned long)&__irqentry_text_start && - ptr < (unsigned long)&__irqentry_text_end) || - (ptr >= (unsigned long)&__softirqentry_text_start && - ptr < (unsigned long)&__softirqentry_text_end); -} - -static inline unsigned int filter_irq_stacks(unsigned long *entries, - unsigned int nr_entries) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (in_irqentry_text(entries[i])) { - /* Include the irqentry function into the stack. */ - return i + 1; - } - } - return nr_entries; -} - static inline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[KASAN_STACK_DEPTH]; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cf5c17d5e361..80f23c9da6b0 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -92,8 +92,16 @@ static void end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn) + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. + * Resetting this prevents additional WARN() from panicking the + * system on this thread. Other threads are blocked by the + * panic_mutex in panic(). + */ + panic_on_warn = 0; panic("panic_on_warn set ...\n"); + } kasan_enable_current(); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c659c68728bc..99d77ffb79c2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -29,6 +29,7 @@ enum scan_result { SCAN_PMD_NULL, SCAN_EXCEED_NONE_PTE, SCAN_PTE_NON_PRESENT, + SCAN_PTE_UFFD_WP, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, @@ -414,8 +415,6 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file && (vm_flags & VM_DENYWRITE))) { - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return false; return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR); } @@ -513,7 +512,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page)); unlock_page(page); putback_lru_page(page); } @@ -613,7 +612,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } inc_node_page_state(page, - NR_ISOLATED_ANON + page_is_file_cache(page)); + NR_ISOLATED_ANON + page_is_file_lru(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -1139,6 +1138,15 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, pte_t pteval = *_pte; if (is_swap_pte(pteval)) { if (++unmapped <= khugepaged_max_ptes_swap) { + /* + * Always be strict with uffd-wp + * enabled swap entries. Please see + * comment below for pte_uffd_wp(). + */ + if (pte_swp_uffd_wp(pteval)) { + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } continue; } else { result = SCAN_EXCEED_SWAP_PTE; @@ -1158,6 +1166,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PTE_NON_PRESENT; goto out_unmap; } + if (pte_uffd_wp(pteval)) { + /* + * Don't collapse the page if any of the small + * PTEs are armed with uffd write protection. + * Here we can also mark the new huge pmd as + * write protected if any of the small ones is + * marked but that could bring uknown + * userfault messages that falls outside of + * the registered range. So, just be simple. + */ + result = SCAN_PTE_UFFD_WP; + goto out_unmap; + } if (pte_write(pteval)) writable = true; @@ -1258,7 +1279,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) } } -#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) +#ifdef CONFIG_SHMEM /* * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. @@ -1973,6 +1994,8 @@ skip: if (khugepaged_scan.address < hstart) khugepaged_scan.address = hstart; VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma)) + goto skip; while (khugepaged_scan.address < hend) { int ret; @@ -1984,14 +2007,10 @@ skip: khugepaged_scan.address + HPAGE_PMD_SIZE > hend); if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { - struct file *file; + struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, khugepaged_scan.address); - if (shmem_file(vma->vm_file) - && !shmem_huge_enabled(vma)) - goto skip; - file = get_file(vma->vm_file); up_read(&mm->mmap_sem); ret = 1; khugepaged_scan_file(mm, file, pgoff, hpage); @@ -455,7 +455,7 @@ static inline bool ksm_test_exit(struct mm_struct *mm) /* * We use break_ksm to break COW on a ksm page: it's a stripped down * - * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) + * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) * put_page(page); * * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, @@ -2813,8 +2813,7 @@ static int ksm_memory_callback(struct notifier_block *self, */ ksm_check_stable_tree(mn->start_pfn, mn->start_pfn + mn->nr_pages); - /* fallthrough */ - + fallthrough; case MEM_CANCEL_OFFLINE: mutex_lock(&ksm_thread_mutex); ksm_run &= ~KSM_RUN_OFFLINE; diff --git a/mm/list_lru.c b/mm/list_lru.c index 8de5e3784ee4..4d5294c39bba 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -223,7 +223,7 @@ restart: switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); - /* fall through */ + fallthrough; case LRU_REMOVED: isolated++; nlru->nr_items--; diff --git a/mm/memblock.c b/mm/memblock.c index 4d06bbaded0f..c79ba6f9920c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1349,7 +1349,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, * Return: * Physical address of allocated memory block on success, %0 on failure. */ -static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, +phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, phys_addr_t align, phys_addr_t start, phys_addr_t end, int nid, bool exact_nid) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ca194864d802..5beea03dd58a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2254,7 +2254,8 @@ static void reclaim_high(struct mem_cgroup *memcg, continue; memcg_memory_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); - } while ((memcg = parent_mem_cgroup(memcg))); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); } static void high_work_func(struct work_struct *work) @@ -2335,6 +2336,9 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg, usage = page_counter_read(&memcg->memory); high = READ_ONCE(memcg->high); + if (usage <= high) + continue; + /* * Prevent division by 0 in overage calculation by acting as if * it was a threshold of 1 page @@ -5812,7 +5816,7 @@ retry: switch (get_mctgt_type(vma, addr, ptent, &target)) { case MC_TARGET_DEVICE: device = true; - /* fall through */ + fallthrough; case MC_TARGET_PAGE: page = target.page; /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1c961cd26c0b..a96364be8ab4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1810,7 +1810,7 @@ static int __soft_offline_page(struct page *page, int flags) */ if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); diff --git a/mm/memory.c b/mm/memory.c index 5c356a57b892..f703fe8c8346 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -733,6 +733,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*src_pte)) pte = pte_swp_mksoft_dirty(pte); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { @@ -762,6 +764,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, is_cow_mapping(vm_flags)) { make_device_private_entry_read(&entry); pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } @@ -785,6 +789,14 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_mkclean(pte); pte = pte_mkold(pte); + /* + * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA + * does not have the VM_UFFD_WP, which means that the uffd + * fork event is not enabled. + */ + if (!(vm_flags & VM_UFFD_WP)) + pte = pte_clear_uffd_wp(pte); + page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); @@ -1407,8 +1419,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, - spinlock_t **ptl) +static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; @@ -1427,9 +1438,40 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); + return pmd; +} + +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) +{ + pmd_t *pmd = walk_to_pmd(mm, addr); + + if (!pmd) + return NULL; return pte_alloc_map_lock(mm, pmd, addr, ptl); } +static int validate_page_before_insert(struct page *page) +{ + if (PageAnon(page) || PageSlab(page) || page_has_type(page)) + return -EINVAL; + flush_dcache_page(page); + return 0; +} + +static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, + unsigned long addr, struct page *page, pgprot_t prot) +{ + if (!pte_none(*pte)) + return -EBUSY; + /* Ok, finally just insert the thing.. */ + get_page(page); + inc_mm_counter_fast(mm, mm_counter_file(page)); + page_add_file_rmap(page, false); + set_pte_at(mm, addr, pte, mk_pte(page, prot)); + return 0; +} + /* * This is the old fallback for page remapping. * @@ -1445,31 +1487,135 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, pte_t *pte; spinlock_t *ptl; - retval = -EINVAL; - if (PageAnon(page) || PageSlab(page) || page_has_type(page)) + retval = validate_page_before_insert(page); + if (retval) goto out; retval = -ENOMEM; - flush_dcache_page(page); pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; - retval = -EBUSY; - if (!pte_none(*pte)) - goto out_unlock; - - /* Ok, finally just insert the thing.. */ - get_page(page); - inc_mm_counter_fast(mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); - - retval = 0; -out_unlock: + retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } +#ifdef pte_index +static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, struct page *page, pgprot_t prot) +{ + int err; + + if (!page_count(page)) + return -EINVAL; + err = validate_page_before_insert(page); + return err ? err : insert_page_into_pte_locked( + mm, pte_offset_map(pmd, addr), addr, page, prot); +} + +/* insert_pages() amortizes the cost of spinlock operations + * when inserting pages in a loop. Arch *must* define pte_index. + */ +static int insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num, pgprot_t prot) +{ + pmd_t *pmd = NULL; + spinlock_t *pte_lock = NULL; + struct mm_struct *const mm = vma->vm_mm; + unsigned long curr_page_idx = 0; + unsigned long remaining_pages_total = *num; + unsigned long pages_to_write_in_pmd; + int ret; +more: + ret = -EFAULT; + pmd = walk_to_pmd(mm, addr); + if (!pmd) + goto out; + + pages_to_write_in_pmd = min_t(unsigned long, + remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); + + /* Allocate the PTE if necessary; takes PMD lock once only. */ + ret = -ENOMEM; + if (pte_alloc(mm, pmd)) + goto out; + pte_lock = pte_lockptr(mm, pmd); + + while (pages_to_write_in_pmd) { + int pte_idx = 0; + const int batch_size = min_t(int, pages_to_write_in_pmd, 8); + + spin_lock(pte_lock); + for (; pte_idx < batch_size; ++pte_idx) { + int err = insert_page_in_batch_locked(mm, pmd, + addr, pages[curr_page_idx], prot); + if (unlikely(err)) { + spin_unlock(pte_lock); + ret = err; + remaining_pages_total -= pte_idx; + goto out; + } + addr += PAGE_SIZE; + ++curr_page_idx; + } + spin_unlock(pte_lock); + pages_to_write_in_pmd -= batch_size; + remaining_pages_total -= batch_size; + } + if (remaining_pages_total) + goto more; + ret = 0; +out: + *num = remaining_pages_total; + return ret; +} +#endif /* ifdef pte_index */ + +/** + * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. + * @vma: user vma to map to + * @addr: target start user address of these pages + * @pages: source kernel pages + * @num: in: number of pages to map. out: number of pages that were *not* + * mapped. (0 means all pages were successfully mapped). + * + * Preferred over vm_insert_page() when inserting multiple pages. + * + * In case of error, we may have mapped a subset of the provided + * pages. It is the caller's responsibility to account for this case. + * + * The same restrictions apply as in vm_insert_page(). + */ +int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ +#ifdef pte_index + const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; + + if (addr < vma->vm_start || end_addr >= vma->vm_end) + return -EFAULT; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } + /* Defer page refcount checking till we're about to map that page. */ + return insert_pages(vma, addr, pages, num, vma->vm_page_prot); +#else + unsigned long idx = 0, pgcount = *num; + int err; + + for (; idx < pgcount; ++idx) { + err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]); + if (err) + break; + } + *num = pgcount - idx; + return err; +#endif /* ifdef pte_index */ +} +EXPORT_SYMBOL(vm_insert_pages); + /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to @@ -1940,7 +2086,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, * @vma: user vma to map to * @addr: target user address to start at * @pfn: page frame number of kernel physical memory address - * @size: size of map area + * @size: size of mapping area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. @@ -2752,6 +2898,11 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + if (userfaultfd_pte_wp(vma, *vmf->pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } + vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!vmf->page) { /* @@ -3085,6 +3236,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); + if (pte_swp_uffd_wp(vmf->orig_pte)) { + pte = pte_mkuffd_wp(pte); + pte = pte_wrprotect(pte); + } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; @@ -3373,7 +3528,7 @@ map_pte: return 0; } -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3475,8 +3630,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, pte_t entry; vm_fault_t ret; - if (pmd_none(*vmf->pmd) && PageTransCompound(page) && - IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); @@ -3949,31 +4103,40 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(vmf->vma)) + if (vma_is_anonymous(vmf->vma)) { + if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd)) + return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf, orig_pmd); - if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + } + if (vmf->vma->vm_ops->huge_fault) { + vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } - /* COW handled on pte level: split pmd */ - VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } -static inline bool vma_is_accessible(struct vm_area_struct *vma) -{ - return vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE); -} - static vm_fault_t create_huge_pud(struct vm_fault *vmf) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ + defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vmf->vma)) - return VM_FAULT_FALLBACK; - if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + goto split; + if (vmf->vma->vm_ops->huge_fault) { + vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } +split: + /* COW or write-notify not handled on PUD level: split pud.*/ + __split_huge_pud(vmf->vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 19389cdc16a5..fc0aad0bc1f5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -67,18 +67,17 @@ void put_online_mems(void) bool movable_node_enabled = false; #ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE -bool memhp_auto_online; +int memhp_default_online_type = MMOP_OFFLINE; #else -bool memhp_auto_online = true; +int memhp_default_online_type = MMOP_ONLINE; #endif -EXPORT_SYMBOL_GPL(memhp_auto_online); static int __init setup_memhp_default_state(char *str) { - if (!strcmp(str, "online")) - memhp_auto_online = true; - else if (!strcmp(str, "offline")) - memhp_auto_online = false; + const int online_type = memhp_online_type_from_str(str); + + if (online_type >= 0) + memhp_default_online_type = online_type; return 1; } @@ -105,7 +104,13 @@ static struct resource *register_memory_resource(u64 start, u64 size) unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; char *resource_name = "System RAM"; - if (start + size > max_mem_size) + /* + * Make sure value parsed from 'mem=' only restricts memory adding + * while booting, so that memory hotplug won't be impacted. Please + * refer to document of 'mem=' in kernel-parameters.txt for more + * details. + */ + if (start + size > max_mem_size && system_state < SYSTEM_RUNNING) return ERR_PTR(-E2BIG); /* @@ -299,11 +304,15 @@ static int check_hotplug_memory_addressable(unsigned long pfn, * add the new pages. */ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, - struct mhp_restrictions *restrictions) + struct mhp_params *params) { + const unsigned long end_pfn = pfn + nr_pages; + unsigned long cur_nr_pages; int err; - unsigned long nr, start_sec, end_sec; - struct vmem_altmap *altmap = restrictions->altmap; + struct vmem_altmap *altmap = params->altmap; + + if (WARN_ON_ONCE(!params->pgprot.pgprot)) + return -EINVAL; err = check_hotplug_memory_addressable(pfn, nr_pages); if (err) @@ -325,18 +334,13 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, if (err) return err; - start_sec = pfn_to_section_nr(pfn); - end_sec = pfn_to_section_nr(pfn + nr_pages - 1); - for (nr = start_sec; nr <= end_sec; nr++) { - unsigned long pfns; - - pfns = min(nr_pages, PAGES_PER_SECTION - - (pfn & ~PAGE_SECTION_MASK)); - err = sparse_add_section(nid, pfn, pfns, altmap); + for (; pfn < end_pfn; pfn += cur_nr_pages) { + /* Select all remaining pages up to the next section boundary */ + cur_nr_pages = min(end_pfn - pfn, + SECTION_ALIGN_UP(pfn + 1) - pfn); + err = sparse_add_section(nid, pfn, cur_nr_pages, altmap); if (err) break; - pfn += pfns; - nr_pages -= pfns; cond_resched(); } vmemmap_populate_print_last(); @@ -494,7 +498,7 @@ static void __remove_section(unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); + struct mem_section *ms = __pfn_to_section(pfn); if (WARN_ON_ONCE(!valid_section(ms))) return; @@ -528,7 +532,8 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages, for (; pfn < end_pfn; pfn += cur_nr_pages) { cond_resched(); /* Select all remaining pages up to the next section boundary */ - cur_nr_pages = min(end_pfn - pfn, -(pfn | PAGE_SECTION_MASK)); + cur_nr_pages = min(end_pfn - pfn, + SECTION_ALIGN_UP(pfn + 1) - pfn); __remove_section(pfn, cur_nr_pages, map_offset, altmap); map_offset = 0; } @@ -988,6 +993,7 @@ static int check_hotplug_memory_range(u64 start, u64 size) static int online_memory_block(struct memory_block *mem, void *arg) { + mem->online_type = memhp_default_online_type; return device_online(&mem->dev); } @@ -999,7 +1005,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { - struct mhp_restrictions restrictions = {}; + struct mhp_params params = { .pgprot = PAGE_KERNEL }; u64 start, size; bool new_node = false; int ret; @@ -1027,7 +1033,7 @@ int __ref add_memory_resource(int nid, struct resource *res) new_node = ret; /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, &restrictions); + ret = arch_add_memory(nid, start, size, ¶ms); if (ret < 0) goto error; @@ -1060,7 +1066,7 @@ int __ref add_memory_resource(int nid, struct resource *res) mem_hotplug_done(); /* online pages if requested */ - if (memhp_auto_online) + if (memhp_default_online_type != MMOP_OFFLINE) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; @@ -1317,7 +1323,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) list_add_tail(&page->lru, &source); if (!__PageMovable(page)) inc_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); } else { pr_warn("failed to isolate pfn %lx\n", pfn); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5fb427aed612..48ba9729062e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -127,6 +127,32 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; +/** + * numa_map_to_online_node - Find closest online node + * @nid: Node id to start the search + * + * Lookup the next closest node by distance if @nid is not online. + */ +int numa_map_to_online_node(int node) +{ + int min_dist = INT_MAX, dist, n, min_node; + + if (node == NUMA_NO_NODE || node_online(node)) + return node; + + min_node = node; + for_each_online_node(n) { + dist = node_distance(node, n); + if (dist < min_dist) { + min_dist = dist; + min_node = n; + } + } + + return min_node; +} +EXPORT_SYMBOL_GPL(numa_map_to_online_node); + struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; @@ -442,6 +468,7 @@ static inline bool queue_pages_required(struct page *page, */ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) + __releases(ptl) { int ret = 0; struct page *page; @@ -627,7 +654,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma, { int nr_updated; - nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); + nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); @@ -678,8 +705,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ - if (!is_vm_hugetlb_page(vma) && - (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) && + if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; @@ -881,7 +907,6 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) switch (p->mode) { case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: *nodes = p->v.nodes; break; @@ -897,12 +922,15 @@ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) static int lookup_node(struct mm_struct *mm, unsigned long addr) { - struct page *p; + struct page *p = NULL; int err; int locked = 1; err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); - if (err >= 0) { + if (err == 0) { + /* E.g. GUP interrupted by fatal signal */ + err = -EFAULT; + } else if (err > 0) { err = page_to_nid(p); put_page(p); } @@ -1023,7 +1051,7 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist, if (!isolate_lru_page(head)) { list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), + NR_ISOLATED_ANON + page_is_file_lru(head), hpage_nr_pages(head)); } else if (flags & MPOL_MF_STRICT) { /* @@ -2066,7 +2094,6 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask) break; case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: *mask = mempolicy->v.nodes; break; @@ -2333,7 +2360,6 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) switch (a->mode) { case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: diff --git a/mm/memremap.c b/mm/memremap.c index 9b2c97ceb775..03e38b7a38f1 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -7,6 +7,7 @@ #include <linux/mm.h> #include <linux/pfn_t.h> #include <linux/swap.h> +#include <linux/mmzone.h> #include <linux/swapops.h> #include <linux/types.h> #include <linux/wait_bit.h> @@ -14,6 +15,28 @@ static DEFINE_XARRAY(pgmap_array); +/* + * The memremap() and memremap_pages() interfaces are alternately used + * to map persistent memory namespaces. These interfaces place different + * constraints on the alignment and size of the mapping (namespace). + * memremap() can map individual PAGE_SIZE pages. memremap_pages() can + * only map subsections (2MB), and at least one architecture (PowerPC) + * the minimum mapping granularity of memremap_pages() is 16MB. + * + * The role of memremap_compat_align() is to communicate the minimum + * arch supported alignment of a namespace such that it can freely + * switch modes without violating the arch constraint. Namely, do not + * allow a namespace to be PAGE_SIZE aligned since that namespace may be + * reconfigured into a mode that requires SUBSECTION_SIZE alignment. + */ +#ifndef CONFIG_ARCH_HAS_MEMREMAP_COMPAT_ALIGN +unsigned long memremap_compat_align(void) +{ + return SUBSECTION_SIZE; +} +EXPORT_SYMBOL_GPL(memremap_compat_align); +#endif + #ifdef CONFIG_DEV_PAGEMAP_OPS DEFINE_STATIC_KEY_FALSE(devmap_managed_key); EXPORT_SYMBOL(devmap_managed_key); @@ -161,13 +184,13 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) { struct resource *res = &pgmap->res; struct dev_pagemap *conflict_pgmap; - struct mhp_restrictions restrictions = { + struct mhp_params params = { /* * We do not want any optional features only our own memmap */ .altmap = pgmap_altmap(pgmap), + .pgprot = PAGE_KERNEL, }; - pgprot_t pgprot = PAGE_KERNEL; int error, is_ram; bool need_devmap_managed = true; @@ -194,7 +217,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } break; case MEMORY_DEVICE_DEVDAX: + need_devmap_managed = false; + break; case MEMORY_DEVICE_PCI_P2PDMA: + params.pgprot = pgprot_noncached(params.pgprot); need_devmap_managed = false; break; default: @@ -259,8 +285,8 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) if (nid < 0) nid = numa_mem_id(); - error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0, - resource_size(res)); + error = track_pfn_remap(NULL, ¶ms.pgprot, PHYS_PFN(res->start), + 0, resource_size(res)); if (error) goto err_pfn_remap; @@ -279,7 +305,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) */ if (pgmap->type == MEMORY_DEVICE_PRIVATE) { error = add_pages(nid, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), &restrictions); + PHYS_PFN(resource_size(res)), ¶ms); } else { error = kasan_add_zero_shadow(__va(res->start), resource_size(res)); if (error) { @@ -288,7 +314,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) } error = arch_add_memory(nid, res->start, resource_size(res), - &restrictions); + ¶ms); } if (!error) { @@ -296,7 +322,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE]; move_pfn_range_to_zone(zone, PHYS_PFN(res->start), - PHYS_PFN(resource_size(res)), restrictions.altmap); + PHYS_PFN(resource_size(res)), params.altmap); } mem_hotplug_done(); diff --git a/mm/migrate.c b/mm/migrate.c index 7ded07081be9..7160c1556f79 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -193,7 +193,7 @@ void putback_movable_pages(struct list_head *l) put_page(page); } else { mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_cache(page), -hpage_nr_pages(page)); + page_is_file_lru(page), -hpage_nr_pages(page)); putback_lru_page(page); } } @@ -243,11 +243,15 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, entry = pte_to_swp_entry(*pvmw.pte); if (is_write_migration_entry(entry)) pte = maybe_mkwrite(pte, vma); + else if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_mkuffd_wp(pte); if (unlikely(is_zone_device_page(new))) { if (is_device_private_page(new)) { entry = make_device_private_entry(new, pte_write(pte)); pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*pvmw.pte)) + pte = pte_mkuffd_wp(pte); } } @@ -647,6 +651,14 @@ void migrate_page_states(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); + /* + * PG_readahead shares the same bit with PG_reclaim. The above + * end_page_writeback() may clear PG_readahead mistakenly, so set the + * bit after that. + */ + if (PageReadahead(page)) + SetPageReadahead(newpage); + copy_page_owner(page, newpage); mem_cgroup_migrate(page, newpage); @@ -1211,7 +1223,7 @@ out: */ if (likely(!__PageMovable(page))) mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_cache(page), -hpage_nr_pages(page)); + page_is_file_lru(page), -hpage_nr_pages(page)); } /* @@ -1518,9 +1530,6 @@ static int do_move_pages_to_node(struct mm_struct *mm, { int err; - if (list_empty(pagelist)) - return 0; - err = migrate_pages(pagelist, alloc_new_node_page, NULL, node, MIGRATE_SYNC, MR_SYSCALL); if (err) @@ -1587,7 +1596,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, err = 1; list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), - NR_ISOLATED_ANON + page_is_file_cache(head), + NR_ISOLATED_ANON + page_is_file_lru(head), hpage_nr_pages(head)); } out_putpage: @@ -1602,6 +1611,32 @@ out: return err; } +static int move_pages_and_store_status(struct mm_struct *mm, int node, + struct list_head *pagelist, int __user *status, + int start, int i, unsigned long nr_pages) +{ + int err; + + if (list_empty(pagelist)) + return 0; + + err = do_move_pages_to_node(mm, pagelist, node); + if (err) { + /* + * Positive err means the number of failed + * pages to migrate. Since we are going to + * abort and return the number of non-migrated + * pages, so need to incude the rest of the + * nr_pages that have not been attempted as + * well. + */ + if (err > 0) + err += nr_pages - i - 1; + return err; + } + return store_status(status, start, node, i - start); +} + /* * Migrate an array of page address onto an array of nodes and fill * the corresponding array of status. @@ -1645,21 +1680,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, current_node = node; start = i; } else if (node != current_node) { - err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) { - /* - * Positive err means the number of failed - * pages to migrate. Since we are going to - * abort and return the number of non-migrated - * pages, so need to incude the rest of the - * nr_pages that have not been attempted as - * well. - */ - if (err > 0) - err += nr_pages - i - 1; - goto out; - } - err = store_status(status, start, current_node, i - start); + err = move_pages_and_store_status(mm, current_node, + &pagelist, status, start, i, nr_pages); if (err) goto out; start = i; @@ -1673,49 +1695,29 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes, err = add_page_for_migration(mm, addr, current_node, &pagelist, flags & MPOL_MF_MOVE_ALL); - if (!err) { - /* The page is already on the target node */ - err = store_status(status, i, current_node, 1); - if (err) - goto out_flush; - continue; - } else if (err > 0) { + if (err > 0) { /* The page is successfully queued for migration */ continue; } - err = store_status(status, i, err, 1); + /* + * If the page is already on the target node (!err), store the + * node, otherwise, store the err. + */ + err = store_status(status, i, err ? : current_node, 1); if (err) goto out_flush; - err = do_move_pages_to_node(mm, &pagelist, current_node); - if (err) { - if (err > 0) - err += nr_pages - i - 1; + err = move_pages_and_store_status(mm, current_node, &pagelist, + status, start, i, nr_pages); + if (err) goto out; - } - if (i > start) { - err = store_status(status, start, current_node, i - start); - if (err) - goto out; - } current_node = NUMA_NO_NODE; } out_flush: - if (list_empty(&pagelist)) - return err; - /* Make sure we do not overwrite the existing error */ - err1 = do_move_pages_to_node(mm, &pagelist, current_node); - /* - * Don't have to report non-attempted pages here since: - * - If the above loop is done gracefully all pages have been - * attempted. - * - If the above loop is aborted it means a fatal error - * happened, should return ret. - */ - if (!err1) - err1 = store_status(status, start, current_node, i - start); + err1 = move_pages_and_store_status(mm, current_node, &pagelist, + status, start, i, nr_pages); if (err >= 0) err = err1; out: @@ -1957,7 +1959,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) return 0; } - page_lru = page_is_file_cache(page); + page_lru = page_is_file_lru(page); mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru, hpage_nr_pages(page)); @@ -1993,7 +1995,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. */ - if (page_mapcount(page) != 1 && page_is_file_cache(page) && + if (page_mapcount(page) != 1 && page_is_file_lru(page) && (vma->vm_flags & VM_EXEC)) goto out; @@ -2001,7 +2003,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, * Also do not migrate dirty pages as not all filesystems can move * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles. */ - if (page_is_file_cache(page) && PageDirty(page)) + if (page_is_file_lru(page) && PageDirty(page)) goto out; isolated = numamigrate_isolate_page(pgdat, page); @@ -2016,7 +2018,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, if (!list_empty(&migratepages)) { list_del(&page->lru); dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + page_is_file_lru(page)); putback_lru_page(page); } isolated = 0; @@ -2046,7 +2048,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; - int page_lru = page_is_file_cache(page); + int page_lru = page_is_file_lru(page); unsigned long start = address & HPAGE_PMD_MASK; new_page = alloc_pages_node(node, @@ -2340,6 +2342,8 @@ again: swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pte)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pte)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, addr, ptep, swp_pte); /* diff --git a/mm/mm_init.c b/mm/mm_init.c index 5c918388de99..7da6991d9435 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -37,7 +37,7 @@ void __init mminit_verify_zonelist(void) struct zonelist *zonelist; int i, listid, zoneid; - BUG_ON(MAX_ZONELISTS > 2); + BUILD_BUG_ON(MAX_ZONELISTS > 2); for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) { /* Identify the zone and nodelist */ diff --git a/mm/mmap.c b/mm/mmap.c index 94ae18398c59..f609e9ec4a25 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1224,7 +1224,7 @@ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct * return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && - !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC|VM_SOFTDIRTY)) && + !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } @@ -1460,7 +1460,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * with MAP_SHARED to preserve backward compatibility. */ flags &= LEGACY_MAP_MASK; - /* fall through */ + fallthrough; case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; @@ -1487,8 +1487,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_SHARED | VM_MAYSHARE; if (!(file->f_mode & FMODE_WRITE)) vm_flags &= ~(VM_MAYWRITE | VM_SHARED); - - /* fall through */ + fallthrough; case MAP_PRIVATE: if (!(file->f_mode & FMODE_READ)) return -EACCES; @@ -2124,6 +2123,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.align_mask = 0; + info.align_offset = 0; return vm_unmapped_area(&info); } #endif @@ -2165,6 +2165,7 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.align_mask = 0; + info.align_offset = 0; addr = vm_unmapped_area(&info); /* @@ -2358,8 +2359,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) gap_addr = TASK_SIZE; next = vma->vm_next; - if (next && next->vm_start < gap_addr && - (next->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2440,7 +2440,7 @@ int expand_downwards(struct vm_area_struct *vma, prev = vma->vm_prev; /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && - (prev->vm_flags & (VM_WRITE|VM_READ|VM_EXEC))) { + vma_is_accessible(prev)) { if (address - prev->vm_end < stack_guard_gap) return -ENOMEM; } diff --git a/mm/mprotect.c b/mm/mprotect.c index 311c0dadf71c..494192ca954b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,12 +37,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; int target_node = NUMA_NO_NODE; + bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; + bool prot_numa = cp_flags & MM_CP_PROT_NUMA; + bool uffd_wp = cp_flags & MM_CP_UFFD_WP; + bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; /* * Can be called with only the mmap_sem for reading by @@ -98,7 +102,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * it cannot move them all from MIGRATE_ASYNC * context. */ - if (page_is_file_cache(page) && PageDirty(page)) + if (page_is_file_lru(page) && PageDirty(page)) continue; /* @@ -114,6 +118,19 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (preserve_write) ptent = pte_mk_savedwrite(ptent); + if (uffd_wp) { + ptent = pte_wrprotect(ptent); + ptent = pte_mkuffd_wp(ptent); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled + * by PF interrupt handler, then + * things like COW could be properly + * handled. + */ + ptent = pte_clear_uffd_wp(ptent); + } + /* Avoid taking write faults for known dirty pages */ if (dirty_accountable && pte_dirty(ptent) && (pte_soft_dirty(ptent) || @@ -122,11 +139,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, } ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); pages++; - } else if (IS_ENABLED(CONFIG_MIGRATION)) { + } else if (is_swap_pte(oldpte)) { swp_entry_t entry = pte_to_swp_entry(oldpte); + pte_t newpte; if (is_write_migration_entry(entry)) { - pte_t newpte; /* * A protection check is difficult so * just be safe and disable write @@ -135,22 +152,28 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, newpte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); - set_pte_at(vma->vm_mm, addr, pte, newpte); - - pages++; - } - - if (is_write_device_private_entry(entry)) { - pte_t newpte; - + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_write_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See * copy_one_pte() for explanation. */ make_device_private_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_pte_at(vma->vm_mm, addr, pte, newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else { + newpte = oldpte; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); pages++; } } @@ -188,7 +211,7 @@ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { pmd_t *pmd; unsigned long next; @@ -229,7 +252,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, __split_huge_pmd(vma, pmd, addr, false, NULL); } else { int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, prot_numa); + newprot, cp_flags); if (nr_ptes) { if (nr_ptes == HPAGE_PMD_NR) { @@ -244,7 +267,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, /* fall through, the trans huge pmd just split */ } this_pages = change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); pages += this_pages; next: cond_resched(); @@ -260,7 +283,7 @@ next: static inline unsigned long change_pud_range(struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { pud_t *pud; unsigned long next; @@ -272,7 +295,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, if (pud_none_or_clear_bad(pud)) continue; pages += change_pmd_range(vma, pud, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (pud++, addr = next, addr != end); return pages; @@ -280,7 +303,7 @@ static inline unsigned long change_pud_range(struct vm_area_struct *vma, static inline unsigned long change_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, - pgprot_t newprot, int dirty_accountable, int prot_numa) + pgprot_t newprot, unsigned long cp_flags) { p4d_t *p4d; unsigned long next; @@ -292,7 +315,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, if (p4d_none_or_clear_bad(p4d)) continue; pages += change_pud_range(vma, p4d, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (p4d++, addr = next, addr != end); return pages; @@ -300,7 +323,7 @@ static inline unsigned long change_p4d_range(struct vm_area_struct *vma, static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -317,7 +340,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; pages += change_p4d_range(vma, pgd, addr, next, newprot, - dirty_accountable, prot_numa); + cp_flags); } while (pgd++, addr = next, addr != end); /* Only flush the TLB if we actually modified any entries: */ @@ -330,14 +353,17 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + unsigned long cp_flags) { unsigned long pages; + BUG_ON((cp_flags & MM_CP_UFFD_WP_ALL) == MM_CP_UFFD_WP_ALL); + if (is_vm_hugetlb_page(vma)) pages = hugetlb_change_protection(vma, start, end, newprot); else - pages = change_protection_range(vma, start, end, newprot, dirty_accountable, prot_numa); + pages = change_protection_range(vma, start, end, newprot, + cp_flags); return pages; } @@ -393,7 +419,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, */ if (arch_has_pfn_modify_check() && (vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) && - (newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) { + (newflags & VM_ACCESS_FLAGS) == 0) { pgprot_t new_pgprot = vm_get_page_prot(newflags); error = walk_page_range(current->mm, start, end, @@ -459,7 +485,7 @@ success: vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, - dirty_accountable, 0); + dirty_accountable ? MM_CP_DIRTY_ACCT : 0); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major @@ -572,7 +598,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, newflags |= (vma->vm_flags & ~mask_off_old_flags); /* newflags >> 4 shift VM_MAY% in place of VM_% */ - if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { + if ((newflags & ~(newflags >> 4)) & VM_ACCESS_FLAGS) { error = -EACCES; goto out; } diff --git a/mm/mremap.c b/mm/mremap.c index a7e282ead438..c881abeba0bf 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -413,9 +413,20 @@ static unsigned long move_vma(struct vm_area_struct *vma, /* Always put back VM_ACCOUNT since we won't unmap */ vma->vm_flags |= VM_ACCOUNT; - vm_acct_memory(vma_pages(new_vma)); + vm_acct_memory(new_len >> PAGE_SHIFT); } + /* + * VMAs can actually be merged back together in copy_vma + * calling merge_vma. This can happen with anonymous vmas + * which have not yet been faulted, so if we were to consider + * this VMA split we'll end up adding VM_ACCOUNT on the + * next VMA, which is completely unrelated if this VMA + * was re-merged. + */ + if (split && new_vma == vma) + split = 0; + /* We always clear VM_LOCKED[ONFAULT] on the old vma */ vma->vm_flags &= VM_LOCKED_CLEAR_MASK; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e5f76da8cd4e..69827d4fa052 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -74,6 +74,7 @@ #include <asm/div64.h> #include "internal.h" #include "shuffle.h" +#include "page_reporting.h" /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); @@ -102,8 +103,8 @@ struct pcpu_drain { struct zone *zone; struct work_struct work; }; -DEFINE_MUTEX(pcpu_drain_mutex); -DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); +static DEFINE_MUTEX(pcpu_drain_mutex); +static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY volatile unsigned long latent_entropy __latent_entropy; @@ -864,6 +865,78 @@ compaction_capture(struct capture_control *capc, struct page *page, } #endif /* CONFIG_COMPACTION */ +/* Used for pages not on another list */ +static inline void add_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages not on another list */ +static inline void add_to_free_list_tail(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_add_tail(&page->lru, &area->free_list[migratetype]); + area->nr_free++; +} + +/* Used for pages which are on another list */ +static inline void move_to_free_list(struct page *page, struct zone *zone, + unsigned int order, int migratetype) +{ + struct free_area *area = &zone->free_area[order]; + + list_move(&page->lru, &area->free_list[migratetype]); +} + +static inline void del_page_from_free_list(struct page *page, struct zone *zone, + unsigned int order) +{ + /* clear reported state and update reported page count */ + if (page_reported(page)) + __ClearPageReported(page); + + list_del(&page->lru); + __ClearPageBuddy(page); + set_page_private(page, 0); + zone->free_area[order].nr_free--; +} + +/* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ +static inline bool +buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, + struct page *page, unsigned int order) +{ + struct page *higher_page, *higher_buddy; + unsigned long combined_pfn; + + if (order >= MAX_ORDER - 2) + return false; + + if (!pfn_valid_within(buddy_pfn)) + return false; + + combined_pfn = buddy_pfn & pfn; + higher_page = page + (combined_pfn - pfn); + buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); + higher_buddy = higher_page + (buddy_pfn - combined_pfn); + + return pfn_valid_within(buddy_pfn) && + page_is_buddy(higher_page, higher_buddy, order + 1); +} + /* * Freeing function for a buddy system allocator. * @@ -891,13 +964,14 @@ compaction_capture(struct capture_control *capc, struct page *page, static inline void __free_one_page(struct page *page, unsigned long pfn, struct zone *zone, unsigned int order, - int migratetype) + int migratetype, bool report) { - unsigned long combined_pfn; + struct capture_control *capc = task_capc(zone); unsigned long uninitialized_var(buddy_pfn); - struct page *buddy; + unsigned long combined_pfn; unsigned int max_order; - struct capture_control *capc = task_capc(zone); + struct page *buddy; + bool to_tail; max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1); @@ -932,7 +1006,7 @@ continue_merging: if (page_is_guard(buddy)) clear_page_guard(zone, buddy, order, migratetype); else - del_page_from_free_area(buddy, &zone->free_area[order]); + del_page_from_free_list(buddy, zone, order); combined_pfn = buddy_pfn & pfn; page = page + (combined_pfn - pfn); pfn = combined_pfn; @@ -966,35 +1040,19 @@ continue_merging: done_merging: set_page_order(page, order); - /* - * If this is not the largest possible page, check if the buddy - * of the next-highest order is free. If it is, it's possible - * that pages are being freed that will coalesce soon. In case, - * that is happening, add the free page to the tail of the list - * so it's less likely to be used soon and more likely to be merged - * as a higher order page - */ - if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn) - && !is_shuffle_order(order)) { - struct page *higher_page, *higher_buddy; - combined_pfn = buddy_pfn & pfn; - higher_page = page + (combined_pfn - pfn); - buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1); - higher_buddy = higher_page + (buddy_pfn - combined_pfn); - if (pfn_valid_within(buddy_pfn) && - page_is_buddy(higher_page, higher_buddy, order + 1)) { - add_to_free_area_tail(page, &zone->free_area[order], - migratetype); - return; - } - } - if (is_shuffle_order(order)) - add_to_free_area_random(page, &zone->free_area[order], - migratetype); + to_tail = shuffle_pick_tail(); + else + to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); + + if (to_tail) + add_to_free_list_tail(page, zone, order, migratetype); else - add_to_free_area(page, &zone->free_area[order], migratetype); + add_to_free_list(page, zone, order, migratetype); + /* Notify page reporting subsystem of freed page */ + if (report) + page_reporting_notify_free(order); } /* @@ -1311,7 +1369,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); - __free_one_page(page, page_to_pfn(page), zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, true); trace_mm_page_pcpu_drain(page, 0, mt); } spin_unlock(&zone->lock); @@ -1327,7 +1385,7 @@ static void free_one_page(struct zone *zone, is_migrate_isolate(migratetype))) { migratetype = get_pfnblock_migratetype(page, pfn); } - __free_one_page(page, pfn, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype, true); spin_unlock(&zone->lock); } @@ -2008,13 +2066,11 @@ void __init init_cma_reserved_pageblock(struct page *page) * -- nyc */ static inline void expand(struct zone *zone, struct page *page, - int low, int high, struct free_area *area, - int migratetype) + int low, int high, int migratetype) { unsigned long size = 1 << high; while (high > low) { - area--; high--; size >>= 1; VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); @@ -2028,7 +2084,7 @@ static inline void expand(struct zone *zone, struct page *page, if (set_page_guard(zone, &page[size], high, migratetype)) continue; - add_to_free_area(&page[size], area, migratetype); + add_to_free_list(&page[size], zone, high, migratetype); set_page_order(&page[size], high); } } @@ -2186,8 +2242,8 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, page = get_page_from_free_area(area, migratetype); if (!page) continue; - del_page_from_free_area(page, area); - expand(zone, page, order, current_order, area, migratetype); + del_page_from_free_list(page, zone, current_order); + expand(zone, page, order, current_order, migratetype); set_pcppage_migratetype(page, migratetype); return page; } @@ -2261,7 +2317,7 @@ static int move_freepages(struct zone *zone, VM_BUG_ON_PAGE(page_zone(page) != zone, page); order = page_order(page); - move_to_free_area(page, &zone->free_area[order], migratetype); + move_to_free_list(page, zone, order, migratetype); page += 1 << order; pages_moved += 1 << order; } @@ -2377,7 +2433,6 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, unsigned int alloc_flags, int start_type, bool whole_block) { unsigned int current_order = page_order(page); - struct free_area *area; int free_pages, movable_pages, alike_pages; int old_block_type; @@ -2448,8 +2503,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, return; single_page: - area = &zone->free_area[current_order]; - move_to_free_area(page, area, start_type); + move_to_free_list(page, zone, current_order, start_type); } /* @@ -3120,7 +3174,6 @@ EXPORT_SYMBOL_GPL(split_page); int __isolate_free_page(struct page *page, unsigned int order) { - struct free_area *area = &page_zone(page)->free_area[order]; unsigned long watermark; struct zone *zone; int mt; @@ -3146,7 +3199,7 @@ int __isolate_free_page(struct page *page, unsigned int order) /* Remove page from free list */ - del_page_from_free_area(page, area); + del_page_from_free_list(page, zone, order); /* * Set the pageblock if the isolated page is at least half of a @@ -3167,6 +3220,26 @@ int __isolate_free_page(struct page *page, unsigned int order) return 1UL << order; } +/** + * __putback_isolated_page - Return a now-isolated page back where we got it + * @page: Page that was isolated + * @order: Order of the isolated page + * @mt: The page's pageblock's migratetype + * + * This function is meant to return a page pulled from the free lists via + * __isolate_free_page back to the free lists they were pulled from. + */ +void __putback_isolated_page(struct page *page, unsigned int order, int mt) +{ + struct zone *zone = page_zone(page); + + /* zone lock should be held when this function is called */ + lockdep_assert_held(&zone->lock); + + /* Return isolated page to tail of freelist. */ + __free_one_page(page, page_to_pfn(page), zone, order, mt, false); +} + /* * Update NUMA hit/miss statistics * @@ -8713,7 +8786,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); offlined_pages += 1 << order; - del_page_from_free_area(page, &zone->free_area[order]); + del_page_from_free_list(page, zone, order); pfn += (1 << order); } spin_unlock_irqrestore(&zone->lock, flags); diff --git a/mm/page_ext.c b/mm/page_ext.c index 08ded037f89f..a3616f7a0e9e 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -303,11 +303,8 @@ static int __meminit online_page_ext(unsigned long start_pfn, VM_BUG_ON(!node_state(nid, N_ONLINE)); } - for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { - if (!pfn_in_present_section(pfn)) - continue; + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) fail = init_section_page_ext(pfn, nid); - } if (!fail) return 0; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index a9fd7c740c23..2c11a38d6e87 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -117,13 +117,11 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); + if (isolated_page) + __putback_isolated_page(page, order, migratetype); zone->nr_isolate_pageblock--; out: spin_unlock_irqrestore(&zone->lock, flags); - if (isolated_page) { - post_alloc_hook(page, order, __GFP_MOVABLE); - __free_pages(page, order); - } } static inline struct page * diff --git a/mm/page_reporting.c b/mm/page_reporting.c new file mode 100644 index 000000000000..3bbd471cfc81 --- /dev/null +++ b/mm/page_reporting.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/page_reporting.h> +#include <linux/gfp.h> +#include <linux/export.h> +#include <linux/delay.h> +#include <linux/scatterlist.h> + +#include "page_reporting.h" +#include "internal.h" + +#define PAGE_REPORTING_DELAY (2 * HZ) +static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; + +enum { + PAGE_REPORTING_IDLE = 0, + PAGE_REPORTING_REQUESTED, + PAGE_REPORTING_ACTIVE +}; + +/* request page reporting */ +static void +__page_reporting_request(struct page_reporting_dev_info *prdev) +{ + unsigned int state; + + /* Check to see if we are in desired state */ + state = atomic_read(&prdev->state); + if (state == PAGE_REPORTING_REQUESTED) + return; + + /* + * If reporting is already active there is nothing we need to do. + * Test against 0 as that represents PAGE_REPORTING_IDLE. + */ + state = atomic_xchg(&prdev->state, PAGE_REPORTING_REQUESTED); + if (state != PAGE_REPORTING_IDLE) + return; + + /* + * Delay the start of work to allow a sizable queue to build. For + * now we are limiting this to running no more than once every + * couple of seconds. + */ + schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); +} + +/* notify prdev of free page reporting request */ +void __page_reporting_notify(void) +{ + struct page_reporting_dev_info *prdev; + + /* + * We use RCU to protect the pr_dev_info pointer. In almost all + * cases this should be present, however in the unlikely case of + * a shutdown this will be NULL and we should exit. + */ + rcu_read_lock(); + prdev = rcu_dereference(pr_dev_info); + if (likely(prdev)) + __page_reporting_request(prdev); + + rcu_read_unlock(); +} + +static void +page_reporting_drain(struct page_reporting_dev_info *prdev, + struct scatterlist *sgl, unsigned int nents, bool reported) +{ + struct scatterlist *sg = sgl; + + /* + * Drain the now reported pages back into their respective + * free lists/areas. We assume at least one page is populated. + */ + do { + struct page *page = sg_page(sg); + int mt = get_pageblock_migratetype(page); + unsigned int order = get_order(sg->length); + + __putback_isolated_page(page, order, mt); + + /* If the pages were not reported due to error skip flagging */ + if (!reported) + continue; + + /* + * If page was not comingled with another page we can + * consider the result to be "reported" since the page + * hasn't been modified, otherwise we will need to + * report on the new larger page when we make our way + * up to that higher order. + */ + if (PageBuddy(page) && page_order(page) == order) + __SetPageReported(page); + } while ((sg = sg_next(sg))); + + /* reinitialize scatterlist now that it is empty */ + sg_init_table(sgl, nents); +} + +/* + * The page reporting cycle consists of 4 stages, fill, report, drain, and + * idle. We will cycle through the first 3 stages until we cannot obtain a + * full scatterlist of pages, in that case we will switch to idle. + */ +static int +page_reporting_cycle(struct page_reporting_dev_info *prdev, struct zone *zone, + unsigned int order, unsigned int mt, + struct scatterlist *sgl, unsigned int *offset) +{ + struct free_area *area = &zone->free_area[order]; + struct list_head *list = &area->free_list[mt]; + unsigned int page_len = PAGE_SIZE << order; + struct page *page, *next; + long budget; + int err = 0; + + /* + * Perform early check, if free area is empty there is + * nothing to process so we can skip this free_list. + */ + if (list_empty(list)) + return err; + + spin_lock_irq(&zone->lock); + + /* + * Limit how many calls we will be making to the page reporting + * device for this list. By doing this we avoid processing any + * given list for too long. + * + * The current value used allows us enough calls to process over a + * sixteenth of the current list plus one additional call to handle + * any pages that may have already been present from the previous + * list processed. This should result in us reporting all pages on + * an idle system in about 30 seconds. + * + * The division here should be cheap since PAGE_REPORTING_CAPACITY + * should always be a power of 2. + */ + budget = DIV_ROUND_UP(area->nr_free, PAGE_REPORTING_CAPACITY * 16); + + /* loop through free list adding unreported pages to sg list */ + list_for_each_entry_safe(page, next, list, lru) { + /* We are going to skip over the reported pages. */ + if (PageReported(page)) + continue; + + /* + * If we fully consumed our budget then update our + * state to indicate that we are requesting additional + * processing and exit this list. + */ + if (budget < 0) { + atomic_set(&prdev->state, PAGE_REPORTING_REQUESTED); + next = page; + break; + } + + /* Attempt to pull page from list and place in scatterlist */ + if (*offset) { + if (!__isolate_free_page(page, order)) { + next = page; + break; + } + + /* Add page to scatter list */ + --(*offset); + sg_set_page(&sgl[*offset], page, page_len, 0); + + continue; + } + + /* + * Make the first non-reported page in the free list + * the new head of the free list before we release the + * zone lock. + */ + if (&page->lru != list && !list_is_first(&page->lru, list)) + list_rotate_to_front(&page->lru, list); + + /* release lock before waiting on report processing */ + spin_unlock_irq(&zone->lock); + + /* begin processing pages in local list */ + err = prdev->report(prdev, sgl, PAGE_REPORTING_CAPACITY); + + /* reset offset since the full list was reported */ + *offset = PAGE_REPORTING_CAPACITY; + + /* update budget to reflect call to report function */ + budget--; + + /* reacquire zone lock and resume processing */ + spin_lock_irq(&zone->lock); + + /* flush reported pages from the sg list */ + page_reporting_drain(prdev, sgl, PAGE_REPORTING_CAPACITY, !err); + + /* + * Reset next to first entry, the old next isn't valid + * since we dropped the lock to report the pages + */ + next = list_first_entry(list, struct page, lru); + + /* exit on error */ + if (err) + break; + } + + /* Rotate any leftover pages to the head of the freelist */ + if (&next->lru != list && !list_is_first(&next->lru, list)) + list_rotate_to_front(&next->lru, list); + + spin_unlock_irq(&zone->lock); + + return err; +} + +static int +page_reporting_process_zone(struct page_reporting_dev_info *prdev, + struct scatterlist *sgl, struct zone *zone) +{ + unsigned int order, mt, leftover, offset = PAGE_REPORTING_CAPACITY; + unsigned long watermark; + int err = 0; + + /* Generate minimum watermark to be able to guarantee progress */ + watermark = low_wmark_pages(zone) + + (PAGE_REPORTING_CAPACITY << PAGE_REPORTING_MIN_ORDER); + + /* + * Cancel request if insufficient free memory or if we failed + * to allocate page reporting statistics for the zone. + */ + if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA)) + return err; + + /* Process each free list starting from lowest order/mt */ + for (order = PAGE_REPORTING_MIN_ORDER; order < MAX_ORDER; order++) { + for (mt = 0; mt < MIGRATE_TYPES; mt++) { + /* We do not pull pages from the isolate free list */ + if (is_migrate_isolate(mt)) + continue; + + err = page_reporting_cycle(prdev, zone, order, mt, + sgl, &offset); + if (err) + return err; + } + } + + /* report the leftover pages before going idle */ + leftover = PAGE_REPORTING_CAPACITY - offset; + if (leftover) { + sgl = &sgl[offset]; + err = prdev->report(prdev, sgl, leftover); + + /* flush any remaining pages out from the last report */ + spin_lock_irq(&zone->lock); + page_reporting_drain(prdev, sgl, leftover, !err); + spin_unlock_irq(&zone->lock); + } + + return err; +} + +static void page_reporting_process(struct work_struct *work) +{ + struct delayed_work *d_work = to_delayed_work(work); + struct page_reporting_dev_info *prdev = + container_of(d_work, struct page_reporting_dev_info, work); + int err = 0, state = PAGE_REPORTING_ACTIVE; + struct scatterlist *sgl; + struct zone *zone; + + /* + * Change the state to "Active" so that we can track if there is + * anyone requests page reporting after we complete our pass. If + * the state is not altered by the end of the pass we will switch + * to idle and quit scheduling reporting runs. + */ + atomic_set(&prdev->state, state); + + /* allocate scatterlist to store pages being reported on */ + sgl = kmalloc_array(PAGE_REPORTING_CAPACITY, sizeof(*sgl), GFP_KERNEL); + if (!sgl) + goto err_out; + + sg_init_table(sgl, PAGE_REPORTING_CAPACITY); + + for_each_zone(zone) { + err = page_reporting_process_zone(prdev, sgl, zone); + if (err) + break; + } + + kfree(sgl); +err_out: + /* + * If the state has reverted back to requested then there may be + * additional pages to be processed. We will defer for 2s to allow + * more pages to accumulate. + */ + state = atomic_cmpxchg(&prdev->state, state, PAGE_REPORTING_IDLE); + if (state == PAGE_REPORTING_REQUESTED) + schedule_delayed_work(&prdev->work, PAGE_REPORTING_DELAY); +} + +static DEFINE_MUTEX(page_reporting_mutex); +DEFINE_STATIC_KEY_FALSE(page_reporting_enabled); + +int page_reporting_register(struct page_reporting_dev_info *prdev) +{ + int err = 0; + + mutex_lock(&page_reporting_mutex); + + /* nothing to do if already in use */ + if (rcu_access_pointer(pr_dev_info)) { + err = -EBUSY; + goto err_out; + } + + /* initialize state and work structures */ + atomic_set(&prdev->state, PAGE_REPORTING_IDLE); + INIT_DELAYED_WORK(&prdev->work, &page_reporting_process); + + /* Begin initial flush of zones */ + __page_reporting_request(prdev); + + /* Assign device to allow notifications */ + rcu_assign_pointer(pr_dev_info, prdev); + + /* enable page reporting notification */ + if (!static_key_enabled(&page_reporting_enabled)) { + static_branch_enable(&page_reporting_enabled); + pr_info("Free page reporting enabled\n"); + } +err_out: + mutex_unlock(&page_reporting_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(page_reporting_register); + +void page_reporting_unregister(struct page_reporting_dev_info *prdev) +{ + mutex_lock(&page_reporting_mutex); + + if (rcu_access_pointer(pr_dev_info) == prdev) { + /* Disable page reporting notification */ + RCU_INIT_POINTER(pr_dev_info, NULL); + synchronize_rcu(); + + /* Flush any existing work, and lock it out */ + cancel_delayed_work_sync(&prdev->work); + } + + mutex_unlock(&page_reporting_mutex); +} +EXPORT_SYMBOL_GPL(page_reporting_unregister); diff --git a/mm/page_reporting.h b/mm/page_reporting.h new file mode 100644 index 000000000000..aa6d37f4dc22 --- /dev/null +++ b/mm/page_reporting.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _MM_PAGE_REPORTING_H +#define _MM_PAGE_REPORTING_H + +#include <linux/mmzone.h> +#include <linux/pageblock-flags.h> +#include <linux/page-isolation.h> +#include <linux/jump_label.h> +#include <linux/slab.h> +#include <asm/pgtable.h> +#include <linux/scatterlist.h> + +#define PAGE_REPORTING_MIN_ORDER pageblock_order + +#ifdef CONFIG_PAGE_REPORTING +DECLARE_STATIC_KEY_FALSE(page_reporting_enabled); +void __page_reporting_notify(void); + +static inline bool page_reported(struct page *page) +{ + return static_branch_unlikely(&page_reporting_enabled) && + PageReported(page); +} + +/** + * page_reporting_notify_free - Free page notification to start page processing + * + * This function is meant to act as a screener for __page_reporting_notify + * which will determine if a give zone has crossed over the high-water mark + * that will justify us beginning page treatment. If we have crossed that + * threshold then it will start the process of pulling some pages and + * placing them in the batch list for treatment. + */ +static inline void page_reporting_notify_free(unsigned int order) +{ + /* Called from hot path in __free_one_page() */ + if (!static_branch_unlikely(&page_reporting_enabled)) + return; + + /* Determine if we have crossed reporting threshold */ + if (order < PAGE_REPORTING_MIN_ORDER) + return; + + /* This will add a few cycles, but should be called infrequently */ + __page_reporting_notify(); +} +#else /* CONFIG_PAGE_REPORTING */ +#define page_reported(_page) false + +static inline void page_reporting_notify_free(unsigned int order) +{ +} +#endif /* CONFIG_PAGE_REPORTING */ +#endif /*_MM_PAGE_REPORTING_H */ diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c index a5a8b22816ff..32558063c3f9 100644 --- a/mm/percpu-stats.c +++ b/mm/percpu-stats.c @@ -3,7 +3,7 @@ * mm/percpu-debug.c * * Copyright (C) 2017 Facebook Inc. - * Copyright (C) 2017 Dennis Zhou <dennisz@fb.com> + * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> * * Prints statistics about the percpu allocator and backing chunks. */ diff --git a/mm/percpu.c b/mm/percpu.c index e9844086b236..d7e3bc649f4e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -6,7 +6,7 @@ * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * * Copyright (C) 2017 Facebook Inc. - * Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com> + * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> * * The percpu allocator handles both static and dynamic areas. Percpu * areas are allocated in chunks which are divided into units. There is diff --git a/mm/rmap.c b/mm/rmap.c index 2df75a119c83..f79a206b271a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -275,19 +275,6 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { struct anon_vma_chain *avc, *pavc; struct anon_vma *root = NULL; - struct vm_area_struct *prev = dst->vm_prev, *pprev = src->vm_prev; - - /* - * If parent share anon_vma with its vm_prev, keep this sharing in in - * child. - * - * 1. Parent has vm_prev, which implies we have vm_prev. - * 2. Parent and its vm_prev have the same anon_vma. - */ - if (!dst->anon_vma && src->anon_vma && - pprev && pprev->anon_vma == src->anon_vma) - dst->anon_vma = prev->anon_vma; - list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { struct anon_vma *anon_vma; @@ -946,7 +933,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, set_pte_at(vma->vm_mm, address, pte, entry); ret = 1; } else { -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t *pmd = pvmw.pmd; pmd_t entry; @@ -1385,7 +1372,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, struct page *subpage; bool ret = true; struct mmu_notifier_range range; - enum ttu_flags flags = (enum ttu_flags)arg; + enum ttu_flags flags = (enum ttu_flags)(long)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) @@ -1515,6 +1502,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); /* * No need to invalidate here it will synchronize on @@ -1614,6 +1603,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); /* * No need to invalidate here it will synchronize on @@ -1680,6 +1671,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = swp_entry_to_pte(entry); if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); + if (pte_uffd_wp(pteval)) + swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, diff --git a/mm/shmem.c b/mm/shmem.c index aad3ba74b0e9..d722eb830317 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -410,7 +410,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ static int shmem_huge __read_mostly; @@ -580,7 +580,7 @@ static long shmem_unused_huge_count(struct super_block *sb, struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } -#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY @@ -589,11 +589,11 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, { return 0; } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) { - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && shmem_huge != SHMEM_HUGE_DENY) return true; @@ -789,6 +789,32 @@ void shmem_unlock_mapping(struct address_space *mapping) } /* + * Check whether a hole-punch or truncation needs to split a huge page, + * returning true if no split was required, or the split has been successful. + * + * Eviction (or truncation to 0 size) should never need to split a huge page; + * but in rare cases might do so, if shmem_undo_range() failed to trylock on + * head, and then succeeded to trylock on tail. + * + * A split can only succeed when there are no additional references on the + * huge page: so the split below relies upon find_get_entries() having stopped + * when it found a subpage of the huge page, without getting further references. + */ +static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end) +{ + if (!PageTransCompound(page)) + return true; + + /* Just proceed to delete a huge page wholly within the range punched */ + if (PageHead(page) && + page->index >= start && page->index + HPAGE_PMD_NR <= end) + return true; + + /* Try to split huge page, so we can truly punch the hole or truncate */ + return split_huge_page(page) >= 0; +} + +/* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. */ @@ -838,31 +864,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (!trylock_page(page)) continue; - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); + if ((!unfalloc || !PageUptodate(page)) && + page_mapping(page) == mapping) { + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (shmem_punch_compound(page, start, end)) truncate_inode_page(mapping, page); - } } unlock_page(page); } @@ -936,43 +942,25 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, lock_page(page); - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - /* - * Partial thp truncate due 'start' in middle - * of THP: don't need to look on these pages - * again on !pvec.nr restart. - */ - if (index != round_down(end, HPAGE_PMD_NR)) - start++; - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); - truncate_inode_page(mapping, page); - } else { + if (page_mapping(page) != mapping) { /* Page was replaced by swap: retry */ unlock_page(page); index--; break; } + VM_BUG_ON_PAGE(PageWriteback(page), page); + if (shmem_punch_compound(page, start, end)) + truncate_inode_page(mapping, page); + else { + /* Wipe the page and don't get stuck */ + clear_highpage(page); + flush_dcache_page(page); + set_page_dirty(page); + if (index < + round_up(start, HPAGE_PMD_NR)) + start = index + 1; + } } unlock_page(page); } @@ -1059,7 +1047,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) * Part of the huge page can be beyond i_size: subject * to shrink under memory pressure. */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { spin_lock(&sbinfo->shrinklist_lock); /* * _careful to defend against unlocked access to @@ -1472,9 +1460,6 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, pgoff_t hindex; struct page *page; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return NULL; - hindex = round_down(index, HPAGE_PMD_NR); if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, XA_PRESENT)) @@ -1486,6 +1471,8 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); + else + count_vm_event(THP_FILE_FALLBACK); return page; } @@ -1511,7 +1498,7 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, int nr; int err = -ENOSPC; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; nr = huge ? HPAGE_PMD_NR : 1; @@ -1813,17 +1800,20 @@ repeat: if (shmem_huge == SHMEM_HUGE_FORCE) goto alloc_huge; switch (sbinfo->huge) { - loff_t i_size; - pgoff_t off; case SHMEM_HUGE_NEVER: goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: + case SHMEM_HUGE_WITHIN_SIZE: { + loff_t i_size; + pgoff_t off; + off = round_up(index, HPAGE_PMD_NR); i_size = round_up(i_size_read(inode), PAGE_SIZE); if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) goto alloc_huge; - /* fallthrough */ + + fallthrough; + } case SHMEM_HUGE_ADVISE: if (sgp_huge == SGP_HUGE) goto alloc_huge; @@ -1871,8 +1861,13 @@ alloc_nohuge: error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, PageTransHuge(page)); - if (error) + if (error) { + if (PageTransHuge(page)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } goto unacct; + } error = shmem_add_to_page_cache(page, mapping, hindex, NULL, gfp & GFP_RECLAIM_MASK); if (error) { @@ -2089,7 +2084,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, get_area = current->mm->get_unmapped_area; addr = get_area(file, uaddr, len, pgoff, flags); - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; @@ -2228,7 +2223,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < (vma->vm_end & HPAGE_PMD_MASK)) { khugepaged_enter(vma, vma->vm_flags); @@ -3113,12 +3108,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - error = 0; + if (error && error != -EOPNOTSUPP) { + iput(inode); + return error; } inode->i_size = len-1; @@ -3243,7 +3235,7 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); - return simple_xattr_set(&info->xattrs, name, value, size, flags); + return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3455,7 +3447,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && - !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; @@ -3601,7 +3593,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); @@ -3846,7 +3838,7 @@ static const struct super_operations shmem_ops = { .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif @@ -3908,7 +3900,7 @@ int __init shmem_init(void) goto out1; } -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else @@ -3924,7 +3916,7 @@ out2: return error; } -#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3976,9 +3968,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute shmem_enabled_attr = __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE bool shmem_huge_enabled(struct vm_area_struct *vma) { struct inode *inode = file_inode(vma->vm_file); @@ -4004,7 +3996,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) return true; - /* fall through */ + fallthrough; case SHMEM_HUGE_ADVISE: /* TODO: implement fadvise() hints */ return (vma->vm_flags & VM_HUGEPAGE); @@ -4013,7 +4005,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) return false; } } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ @@ -4182,7 +4174,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < (vma->vm_end & HPAGE_PMD_MASK)) { khugepaged_enter(vma, vma->vm_flags); diff --git a/mm/shuffle.c b/mm/shuffle.c index c716059cbd3c..44406d9977c7 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -183,11 +183,11 @@ void __meminit __shuffle_free_memory(pg_data_t *pgdat) shuffle_zone(z); } -void add_to_free_area_random(struct page *page, struct free_area *area, - int migratetype) +bool shuffle_pick_tail(void) { static u64 rand; static u8 rand_bits; + bool ret; /* * The lack of locking is deliberate. If 2 threads race to @@ -198,10 +198,10 @@ void add_to_free_area_random(struct page *page, struct free_area *area, rand = get_random_u64(); } - if (rand & 1) - add_to_free_area(page, area, migratetype); - else - add_to_free_area_tail(page, area, migratetype); + ret = rand & 1; + rand_bits--; rand >>= 1; + + return ret; } diff --git a/mm/shuffle.h b/mm/shuffle.h index 777a257a0d2f..4d79f03b6658 100644 --- a/mm/shuffle.h +++ b/mm/shuffle.h @@ -22,6 +22,7 @@ enum mm_shuffle_ctl { DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl); extern void __shuffle_free_memory(pg_data_t *pgdat); +extern bool shuffle_pick_tail(void); static inline void shuffle_free_memory(pg_data_t *pgdat) { if (!static_branch_unlikely(&page_alloc_shuffle_key)) @@ -44,6 +45,11 @@ static inline bool is_shuffle_order(int order) return order >= SHUFFLE_ORDER; } #else +static inline bool shuffle_pick_tail(void) +{ + return false; +} + static inline void shuffle_free_memory(pg_data_t *pgdat) { } diff --git a/mm/slab_common.c b/mm/slab_common.c index 5282f881d2f5..23c7500eea7d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -731,7 +731,7 @@ static void kmemcg_rcufn(struct rcu_head *head) /* * We need to grab blocking locks. Bounce to ->work. The * work item shares the space with the RCU head and can't be - * initialized eariler. + * initialized earlier. */ INIT_WORK(&s->memcg_params.work, kmemcg_workfn); queue_work(memcg_kmem_cache_wq, &s->memcg_params.work); @@ -1581,6 +1581,7 @@ static int slabinfo_open(struct inode *inode, struct file *file) } static const struct proc_ops slabinfo_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = slabinfo_open, .proc_read = seq_read, .proc_write = slabinfo_write, diff --git a/mm/slub.c b/mm/slub.c index 3098e0cf2899..332d4b459a90 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -449,6 +449,7 @@ static DEFINE_SPINLOCK(object_map_lock); * not vanish from under us. */ static unsigned long *get_map(struct kmem_cache *s, struct page *page) + __acquires(&object_map_lock) { void *p; void *addr = page_address(page); @@ -465,7 +466,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) return object_map; } -static void put_map(unsigned long *map) +static void put_map(unsigned long *map) __releases(&object_map_lock) { VM_BUG_ON(map != object_map); lockdep_assert_held(&object_map_lock); diff --git a/mm/sparse.c b/mm/sparse.c index f1af4d4ee80b..1aee5a481571 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -209,6 +209,7 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } +#ifdef CONFIG_SPARSEMEM_VMEMMAP static void subsection_mask_set(unsigned long *map, unsigned long pfn, unsigned long nr_pages) { @@ -243,6 +244,11 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) nr_pages -= pfns; } } +#else +void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +{ +} +#endif /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) @@ -660,6 +666,55 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + unsigned long *subsection_map = ms->usage + ? &ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return -EINVAL; + + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return bitmap_empty(&ms->usage->subsection_map[0], + SUBSECTIONS_PER_SECTION); +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + struct mem_section *ms = __pfn_to_section(pfn); + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + unsigned long *subsection_map; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + return rc; +} #else struct page * __meminit populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap) @@ -703,48 +758,51 @@ static void free_map_bootmem(struct page *memmap) put_page_bootmem(page); } } + +static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + return 0; +} + +static bool is_subsection_map_empty(struct mem_section *ms) +{ + return true; +} + +static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages) +{ + return 0; +} #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +/* + * To deactivate a memory region, there are 3 cases to handle across + * two configurations (SPARSEMEM_VMEMMAP={y,n}): + * + * 1. deactivation of a partial hot-added section (only possible in + * the SPARSEMEM_VMEMMAP=y case). + * a) section was present at memory init. + * b) section was hot-added post memory init. + * 2. deactivation of a complete hot-added section. + * 3. deactivation of a complete section from memory init. + * + * For 1, when subsection_map does not empty we will not be freeing the + * usage map, but still need to free the vmemmap range. + * + * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified + */ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; - DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; struct mem_section *ms = __pfn_to_section(pfn); bool section_is_early = early_section(ms); struct page *memmap = NULL; bool empty; - unsigned long *subsection_map = ms->usage - ? &ms->usage->subsection_map[0] : NULL; - - subsection_mask_set(map, pfn, nr_pages); - if (subsection_map) - bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); - if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), - "section already deactivated (%#lx + %ld)\n", - pfn, nr_pages)) + if (clear_subsection_map(pfn, nr_pages)) return; - /* - * There are 3 cases to handle across two configurations - * (SPARSEMEM_VMEMMAP={y,n}): - * - * 1/ deactivation of a partial hot-added section (only possible - * in the SPARSEMEM_VMEMMAP=y case). - * a/ section was present at memory init - * b/ section was hot-added post memory init - * 2/ deactivation of a complete hot-added section - * 3/ deactivation of a complete section from memory init - * - * For 1/, when subsection_map does not empty we will not be - * freeing the usage map, but still need to free the vmemmap - * range. - * - * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified - */ - bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); - empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION); + empty = is_subsection_map_empty(ms); if (empty) { unsigned long section_nr = pfn_to_section_nr(pfn); @@ -780,31 +838,19 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages, static struct page * __meminit section_activate(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; struct mem_section *ms = __pfn_to_section(pfn); struct mem_section_usage *usage = NULL; - unsigned long *subsection_map; struct page *memmap; int rc = 0; - subsection_mask_set(map, pfn, nr_pages); - if (!ms->usage) { usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); if (!usage) return ERR_PTR(-ENOMEM); ms->usage = usage; } - subsection_map = &ms->usage->subsection_map[0]; - - if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) - rc = -EINVAL; - else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) - rc = -EEXIST; - else - bitmap_or(subsection_map, map, subsection_map, - SUBSECTIONS_PER_SECTION); + rc = fill_subsection_map(pfn, nr_pages); if (rc) { if (usage) ms->usage = NULL; @@ -840,6 +886,10 @@ static struct page * __meminit section_activate(int nid, unsigned long pfn, * * This is only intended for hotplug. * + * Note that only VMEMMAP supports sub-section aligned hotplug, + * the proper alignment and size are gated by check_pfn_span(). + * + * * Return: * * 0 - On success. * * -EEXIST - Section has been present. diff --git a/mm/swap.c b/mm/swap.c index a4af8c999963..bf9a79fed62d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -276,7 +276,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); + int file = page_is_file_lru(page); int lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru); @@ -394,7 +394,7 @@ void mark_page_accessed(struct page *page) else __lru_cache_activate_page(page); ClearPageReferenced(page); - if (page_is_file_cache(page)) + if (page_is_file_lru(page)) workingset_activation(page); } if (page_is_idle(page)) @@ -515,7 +515,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, return; active = PageActive(page); - file = page_is_file_cache(page); + file = page_is_file_lru(page); lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + active); @@ -548,7 +548,7 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, void *arg) { if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); + int file = page_is_file_lru(page); int lru = page_lru_base_type(page); del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); @@ -573,9 +573,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, ClearPageActive(page); ClearPageReferenced(page); /* - * lazyfree pages are clean anonymous pages. They have - * SwapBacked flag cleared to distinguish normal anonymous - * pages + * Lazyfree pages are clean anonymous pages. They have + * PG_swapbacked flag cleared, to distinguish them from normal + * anonymous pages */ ClearPageSwapBacked(page); add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); @@ -962,7 +962,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, if (page_evictable(page)) { lru = page_lru(page); - update_page_reclaim_stat(lruvec, page_is_file_cache(page), + update_page_reclaim_stat(lruvec, page_is_file_lru(page), PageActive(page)); if (was_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); @@ -1004,6 +1004,10 @@ void __pagevec_lru_add(struct pagevec *pvec) * ascending indexes. There may be holes in the indices due to * not-present entries. * + * Only one subpage of a Transparent Huge Page is returned in one call: + * allowing truncate_inode_pages_range() to evict the whole THP without + * cycling through a pagevec of extra references. + * * pagevec_lookup_entries() returns the number of entries which were * found. */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 273a923c275c..5871a2aa86a5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2797,6 +2797,7 @@ static int swaps_open(struct inode *inode, struct file *file) } static const struct proc_ops swaps_proc_ops = { + .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = swaps_open, .proc_read = seq_read, .proc_lseek = seq_lseek, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index bd96855f3961..512576e171ce 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -53,7 +53,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, unsigned long dst_addr, unsigned long src_addr, - struct page **pagep) + struct page **pagep, + bool wp_copy) { struct mem_cgroup *memcg; pte_t _dst_pte, *dst_pte; @@ -99,9 +100,13 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false)) goto out_release; - _dst_pte = mk_pte(page, dst_vma->vm_page_prot); - if (dst_vma->vm_flags & VM_WRITE) - _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); + _dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot)); + if (dst_vma->vm_flags & VM_WRITE) { + if (wp_copy) + _dst_pte = pte_mkuffd_wp(_dst_pte); + else + _dst_pte = pte_mkwrite(_dst_pte); + } dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); if (dst_vma->vm_file) { @@ -415,7 +420,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, unsigned long dst_addr, unsigned long src_addr, struct page **page, - bool zeropage) + bool zeropage, + bool wp_copy) { ssize_t err; @@ -432,11 +438,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm, if (!(dst_vma->vm_flags & VM_SHARED)) { if (!zeropage) err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, page); + dst_addr, src_addr, page, + wp_copy); else err = mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, dst_addr); } else { + VM_WARN_ON_ONCE(wp_copy); if (!zeropage) err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, @@ -454,7 +462,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_start, unsigned long len, bool zeropage, - bool *mmap_changing) + bool *mmap_changing, + __u64 mode) { struct vm_area_struct *dst_vma; ssize_t err; @@ -462,6 +471,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long src_addr, dst_addr; long copied; struct page *page; + bool wp_copy; /* * Sanitize the command parameters: @@ -508,6 +518,14 @@ retry: goto out_unlock; /* + * validate 'mode' now that we know the dst_vma: don't allow + * a wrprotect copy if the userfaultfd didn't register as WP. + */ + wp_copy = mode & UFFDIO_COPY_MODE_WP; + if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP)) + goto out_unlock; + + /* * If this is a HUGETLB vma, pass off to appropriate routine */ if (is_vm_hugetlb_page(dst_vma)) @@ -562,7 +580,7 @@ retry: BUG_ON(pmd_trans_huge(*dst_pmd)); err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - src_addr, &page, zeropage); + src_addr, &page, zeropage, wp_copy); cond_resched(); if (unlikely(err == -ENOENT)) { @@ -609,14 +627,68 @@ out: ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool *mmap_changing) + bool *mmap_changing, __u64 mode) { return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, - mmap_changing); + mmap_changing, mode); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0); +} + +int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, bool enable_wp, bool *mmap_changing) +{ + struct vm_area_struct *dst_vma; + pgprot_t newprot; + int err; + + /* + * Sanitize the command parameters: + */ + BUG_ON(start & ~PAGE_MASK); + BUG_ON(len & ~PAGE_MASK); + + /* Does the address range wrap, or is the span zero-sized? */ + BUG_ON(start + len <= start); + + down_read(&dst_mm->mmap_sem); + + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && READ_ONCE(*mmap_changing)) + goto out_unlock; + + err = -ENOENT; + dst_vma = find_dst_vma(dst_mm, start, len); + /* + * Make sure the vma is not shared, that the dst range is + * both valid and fully within a single existing vma. + */ + if (!dst_vma || (dst_vma->vm_flags & VM_SHARED)) + goto out_unlock; + if (!userfaultfd_wp(dst_vma)) + goto out_unlock; + if (!vma_is_anonymous(dst_vma)) + goto out_unlock; + + if (enable_wp) + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + else + newprot = vm_get_page_prot(dst_vma->vm_flags); + + change_protection(dst_vma, start, start + len, newprot, + enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + + err = 0; +out_unlock: + up_read(&dst_mm->mmap_sem); + return err; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6b8eeb0ecee5..399f219544f7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3368,7 +3368,7 @@ retry: goto overflow; /* - * If required width exeeds current VA block, move + * If required width exceeds current VA block, move * base downwards and then recheck. */ if (base + end > va->va_end) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2e8e690d2813..b06868fc4926 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -919,7 +919,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * exceptional entries and shadow exceptional entries in the * same address_space. */ - if (reclaimed && page_is_file_cache(page) && + if (reclaimed && page_is_file_lru(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(page, target_memcg); __delete_from_page_cache(page, shadow); @@ -1043,7 +1043,7 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them */ - if (!page_is_file_cache(page) || + if (!page_is_file_lru(page) || (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; @@ -1315,7 +1315,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * the rest of the LRU for clean pages and see * the same dirty pages again (PageReclaim). */ - if (page_is_file_cache(page) && + if (page_is_file_lru(page) && (!current_is_kswapd() || !PageReclaim(page) || !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* @@ -1459,7 +1459,7 @@ activate_locked: try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); if (!PageMlocked(page)) { - int type = page_is_file_cache(page); + int type = page_is_file_lru(page); SetPageActive(page); stat->nr_activate[type] += nr_pages; count_memcg_page_event(page, PGACTIVATE); @@ -1497,7 +1497,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { - if (page_is_file_cache(page) && !PageDirty(page) && + if (page_is_file_lru(page) && !PageDirty(page) && !__PageMovable(page) && !PageUnevictable(page)) { ClearPageActive(page); list_move(&page->lru, &clean_pages); @@ -2053,7 +2053,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * IO, plus JVM can create lots of anon VM_EXEC pages, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { + if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) { list_add(&page->lru, &l_active); continue; } diff --git a/mm/vmstat.c b/mm/vmstat.c index c9c0d71f917f..96d21a792b57 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1256,9 +1256,12 @@ const char * const vmstat_text[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE "thp_fault_alloc", "thp_fault_fallback", + "thp_fault_fallback_charge", "thp_collapse_alloc", "thp_collapse_alloc_failed", "thp_file_alloc", + "thp_file_fallback", + "thp_file_fallback_charge", "thp_file_mapped", "thp_split_page", "thp_split_page_failed", diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 22d17ecfe7df..2f836a2b993f 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -424,7 +424,7 @@ static void *zs_zpool_map(void *pool, unsigned long handle, case ZPOOL_MM_WO: zs_mm = ZS_MM_WO; break; - case ZPOOL_MM_RW: /* fall through */ + case ZPOOL_MM_RW: default: zs_mm = ZS_MM_RW; break; @@ -891,12 +891,12 @@ static inline int trypin_tag(unsigned long handle) return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); } -static void pin_tag(unsigned long handle) +static void pin_tag(unsigned long handle) __acquires(bitlock) { bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); } -static void unpin_tag(unsigned long handle) +static void unpin_tag(unsigned long handle) __releases(bitlock) { bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); } @@ -1833,12 +1833,12 @@ static void migrate_lock_init(struct zspage *zspage) rwlock_init(&zspage->lock); } -static void migrate_read_lock(struct zspage *zspage) +static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock) { read_lock(&zspage->lock); } -static void migrate_read_unlock(struct zspage *zspage) +static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) { read_unlock(&zspage->lock); } diff --git a/mm/zswap.c b/mm/zswap.c index 55094e63b72d..fbb782924ccc 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -77,8 +77,8 @@ static bool zswap_pool_reached_full; #define ZSWAP_PARAM_UNSET "" -/* Enable/disable zswap (disabled by default) */ -static bool zswap_enabled; +/* Enable/disable zswap */ +static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON); static int zswap_enabled_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_enabled_param_ops = { @@ -88,8 +88,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = { module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); /* Crypto compressor to use */ -#define ZSWAP_COMPRESSOR_DEFAULT "lzo" -static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; +static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; static int zswap_compressor_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_compressor_param_ops = { @@ -101,8 +100,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops, &zswap_compressor, 0644); /* Compressed storage zpool to use */ -#define ZSWAP_ZPOOL_DEFAULT "zbud" -static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; static int zswap_zpool_param_set(const char *, const struct kernel_param *); static struct kernel_param_ops zswap_zpool_param_ops = { .set = zswap_zpool_param_set, @@ -599,11 +597,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void) bool has_comp, has_zpool; has_comp = crypto_has_comp(zswap_compressor, 0, 0); - if (!has_comp && strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) { + if (!has_comp && strcmp(zswap_compressor, + CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) { pr_err("compressor %s not available, using default %s\n", - zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT); + zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT); param_free_charp(&zswap_compressor); - zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT; has_comp = crypto_has_comp(zswap_compressor, 0, 0); } if (!has_comp) { @@ -614,11 +613,12 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void) } has_zpool = zpool_has_pool(zswap_zpool_type); - if (!has_zpool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + if (!has_zpool && strcmp(zswap_zpool_type, + CONFIG_ZSWAP_ZPOOL_DEFAULT)) { pr_err("zpool %s not available, using default %s\n", - zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT); + zswap_zpool_type, CONFIG_ZSWAP_ZPOOL_DEFAULT); param_free_charp(&zswap_zpool_type); - zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT; has_zpool = zpool_has_pool(zswap_zpool_type); } if (!has_zpool) { |