diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2019-08-09 16:53:39 +0200 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2019-08-09 16:53:39 +0200 |
commit | 0e1c438c44dd9cde56effb44c5f1cfeda72e108d (patch) | |
tree | fa3492d4d7d8b7444e5d8ebe6c78210826333e4b /mm | |
parent | c096397c78f766db972f923433031f2dec01cae0 (diff) | |
parent | cdb2d3ee0436d74fa9092f2df46aaa6f9e03c969 (diff) | |
download | linux-0e1c438c44dd9cde56effb44c5f1cfeda72e108d.tar.gz linux-0e1c438c44dd9cde56effb44c5f1cfeda72e108d.tar.bz2 linux-0e1c438c44dd9cde56effb44c5f1cfeda72e108d.zip |
Merge tag 'kvmarm-fixes-for-5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm fixes for 5.3
- A bunch of switch/case fall-through annotation, fixing one actual bug
- Fix PMU reset bug
- Add missing exception class debug strings
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 55 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/cma.c | 15 | ||||
-rw-r--r-- | mm/gup.c | 9 | ||||
-rw-r--r-- | mm/hmm.c | 587 | ||||
-rw-r--r-- | mm/huge_memory.c | 11 | ||||
-rw-r--r-- | mm/maccess.c | 122 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 35 | ||||
-rw-r--r-- | mm/memory-failure.c | 6 | ||||
-rw-r--r-- | mm/memory.c | 62 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 337 | ||||
-rw-r--r-- | mm/mempolicy.c | 1 | ||||
-rw-r--r-- | mm/migrate.c | 35 | ||||
-rw-r--r-- | mm/nommu.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 33 | ||||
-rw-r--r-- | mm/shmem.c | 11 | ||||
-rw-r--r-- | mm/slab_common.c | 3 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 21 | ||||
-rw-r--r-- | mm/sparse.c | 355 | ||||
-rw-r--r-- | mm/swap.c | 15 | ||||
-rw-r--r-- | mm/util.c | 75 | ||||
-rw-r--r-- | mm/vmscan.c | 44 | ||||
-rw-r--r-- | mm/z3fold.c | 43 | ||||
-rw-r--r-- | mm/zsmalloc.c | 12 |
25 files changed, 867 insertions, 1028 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 0b4352557dd5..56cec636a1fc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -649,8 +649,7 @@ config IDLE_PAGE_TRACKING See Documentation/admin-guide/mm/idle_page_tracking.rst for more details. -# arch_add_memory() comprehends device memory -config ARCH_HAS_ZONE_DEVICE +config ARCH_HAS_PTE_DEVMAP bool config ZONE_DEVICE @@ -658,7 +657,7 @@ config ZONE_DEVICE depends on MEMORY_HOTPLUG depends on MEMORY_HOTREMOVE depends on SPARSEMEM_VMEMMAP - depends on ARCH_HAS_ZONE_DEVICE + depends on ARCH_HAS_PTE_DEVMAP select XARRAY_MULTI help @@ -670,47 +669,17 @@ config ZONE_DEVICE If FS_DAX is enabled, then say Y. -config ARCH_HAS_HMM_MIRROR - bool - default y - depends on (X86_64 || PPC64) - depends on MMU && 64BIT - -config ARCH_HAS_HMM_DEVICE - bool - default y - depends on (X86_64 || PPC64) - depends on MEMORY_HOTPLUG - depends on MEMORY_HOTREMOVE - depends on SPARSEMEM_VMEMMAP - depends on ARCH_HAS_ZONE_DEVICE - select XARRAY_MULTI - -config ARCH_HAS_HMM - bool - default y - depends on (X86_64 || PPC64) - depends on ZONE_DEVICE - depends on MMU && 64BIT - depends on MEMORY_HOTPLUG - depends on MEMORY_HOTREMOVE - depends on SPARSEMEM_VMEMMAP - config MIGRATE_VMA_HELPER bool config DEV_PAGEMAP_OPS bool -config HMM - bool - select MMU_NOTIFIER - select MIGRATE_VMA_HELPER - config HMM_MIRROR bool "HMM mirror CPU page table into a device page table" - depends on ARCH_HAS_HMM - select HMM + depends on (X86_64 || PPC64) + depends on MMU && 64BIT + select MMU_NOTIFIER help Select HMM_MIRROR if you want to mirror range of the CPU page table of a process into a device page table. Here, mirror means "keep synchronized". @@ -720,8 +689,7 @@ config HMM_MIRROR config DEVICE_PRIVATE bool "Unaddressable device memory (GPU memory, ...)" - depends on ARCH_HAS_HMM - select HMM + depends on ZONE_DEVICE select DEV_PAGEMAP_OPS help @@ -729,17 +697,6 @@ config DEVICE_PRIVATE memory; i.e., memory that is only accessible from the device (or group of devices). You likely also want to select HMM_MIRROR. -config DEVICE_PUBLIC - bool "Addressable device memory (like GPU memory)" - depends on ARCH_HAS_HMM - select HMM - select DEV_PAGEMAP_OPS - - help - Allows creation of struct pages to represent addressable device - memory; i.e., memory that is accessible from both the device and - the CPU - config FRAME_VECTOR bool diff --git a/mm/Makefile b/mm/Makefile index dc0746ca1109..338e528ad436 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -102,5 +102,5 @@ obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o -obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_HMM_MIRROR) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o @@ -278,6 +278,12 @@ int __init cma_declare_contiguous(phys_addr_t base, */ alignment = max(alignment, (phys_addr_t)PAGE_SIZE << max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); + if (fixed && base & (alignment - 1)) { + ret = -EINVAL; + pr_err("Region at %pa must be aligned to %pa bytes\n", + &base, &alignment); + goto err; + } base = ALIGN(base, alignment); size = ALIGN(size, alignment); limit &= ~(alignment - 1); @@ -308,6 +314,13 @@ int __init cma_declare_contiguous(phys_addr_t base, if (limit == 0 || limit > memblock_end) limit = memblock_end; + if (base + size > limit) { + ret = -EINVAL; + pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n", + &size, &base, &limit); + goto err; + } + /* Reserve memory */ if (fixed) { if (memblock_is_region_reserved(base, size) || @@ -494,7 +507,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, * @pages: Allocated pages. * @count: Number of allocated pages. * - * This function releases memory allocated by alloc_cma(). + * This function releases memory allocated by cma_alloc(). * It returns false when provided pages do not belong to contiguous area and * true otherwise. */ @@ -609,13 +609,6 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) goto unmap; *page = pte_page(*pte); - - /* - * This should never happen (a device public page in the gate - * area). - */ - if (is_device_public_page(*page)) - goto unmap; } if (unlikely(!try_get_page(*page))) { ret = -ENOMEM; @@ -1902,7 +1895,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ -#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, unsigned long end, struct page **pages, int *nr) { @@ -20,26 +20,14 @@ #include <linux/swapops.h> #include <linux/hugetlb.h> #include <linux/memremap.h> +#include <linux/sched/mm.h> #include <linux/jump_label.h> #include <linux/dma-mapping.h> #include <linux/mmu_notifier.h> #include <linux/memory_hotplug.h> -#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) - -#if IS_ENABLED(CONFIG_HMM_MIRROR) static const struct mmu_notifier_ops hmm_mmu_notifier_ops; -static inline struct hmm *mm_get_hmm(struct mm_struct *mm) -{ - struct hmm *hmm = READ_ONCE(mm->hmm); - - if (hmm && kref_get_unless_zero(&hmm->kref)) - return hmm; - - return NULL; -} - /** * hmm_get_or_create - register HMM against an mm (HMM internal) * @@ -54,11 +42,16 @@ static inline struct hmm *mm_get_hmm(struct mm_struct *mm) */ static struct hmm *hmm_get_or_create(struct mm_struct *mm) { - struct hmm *hmm = mm_get_hmm(mm); - bool cleanup = false; + struct hmm *hmm; - if (hmm) - return hmm; + lockdep_assert_held_write(&mm->mmap_sem); + + /* Abuse the page_table_lock to also protect mm->hmm. */ + spin_lock(&mm->page_table_lock); + hmm = mm->hmm; + if (mm->hmm && kref_get_unless_zero(&mm->hmm->kref)) + goto out_unlock; + spin_unlock(&mm->page_table_lock); hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); if (!hmm) @@ -68,55 +61,50 @@ static struct hmm *hmm_get_or_create(struct mm_struct *mm) init_rwsem(&hmm->mirrors_sem); hmm->mmu_notifier.ops = NULL; INIT_LIST_HEAD(&hmm->ranges); - mutex_init(&hmm->lock); + spin_lock_init(&hmm->ranges_lock); kref_init(&hmm->kref); hmm->notifiers = 0; - hmm->dead = false; hmm->mm = mm; - spin_lock(&mm->page_table_lock); - if (!mm->hmm) - mm->hmm = hmm; - else - cleanup = true; - spin_unlock(&mm->page_table_lock); + hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; + if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { + kfree(hmm); + return NULL; + } - if (cleanup) - goto error; + mmgrab(hmm->mm); /* - * We should only get here if hold the mmap_sem in write mode ie on - * registration of first mirror through hmm_mirror_register() + * We hold the exclusive mmap_sem here so we know that mm->hmm is + * still NULL or 0 kref, and is safe to update. */ - hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; - if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) - goto error_mm; + spin_lock(&mm->page_table_lock); + mm->hmm = hmm; +out_unlock: + spin_unlock(&mm->page_table_lock); return hmm; +} -error_mm: - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); -error: +static void hmm_free_rcu(struct rcu_head *rcu) +{ + struct hmm *hmm = container_of(rcu, struct hmm, rcu); + + mmdrop(hmm->mm); kfree(hmm); - return NULL; } static void hmm_free(struct kref *kref) { struct hmm *hmm = container_of(kref, struct hmm, kref); - struct mm_struct *mm = hmm->mm; - mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); + spin_lock(&hmm->mm->page_table_lock); + if (hmm->mm->hmm == hmm) + hmm->mm->hmm = NULL; + spin_unlock(&hmm->mm->page_table_lock); - spin_lock(&mm->page_table_lock); - if (mm->hmm == hmm) - mm->hmm = NULL; - spin_unlock(&mm->page_table_lock); - - kfree(hmm); + mmu_notifier_unregister_no_release(&hmm->mmu_notifier, hmm->mm); + mmu_notifier_call_srcu(&hmm->rcu, hmm_free_rcu); } static inline void hmm_put(struct hmm *hmm) @@ -124,86 +112,73 @@ static inline void hmm_put(struct hmm *hmm) kref_put(&hmm->kref, hmm_free); } -void hmm_mm_destroy(struct mm_struct *mm) +static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct hmm *hmm; + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); + struct hmm_mirror *mirror; - spin_lock(&mm->page_table_lock); - hmm = mm_get_hmm(mm); - mm->hmm = NULL; - if (hmm) { - hmm->mm = NULL; - hmm->dead = true; - spin_unlock(&mm->page_table_lock); - hmm_put(hmm); + /* Bail out if hmm is in the process of being freed */ + if (!kref_get_unless_zero(&hmm->kref)) return; + + /* + * Since hmm_range_register() holds the mmget() lock hmm_release() is + * prevented as long as a range exists. + */ + WARN_ON(!list_empty_careful(&hmm->ranges)); + + down_read(&hmm->mirrors_sem); + list_for_each_entry(mirror, &hmm->mirrors, list) { + /* + * Note: The driver is not allowed to trigger + * hmm_mirror_unregister() from this thread. + */ + if (mirror->ops->release) + mirror->ops->release(mirror); } + up_read(&hmm->mirrors_sem); - spin_unlock(&mm->page_table_lock); + hmm_put(hmm); } -static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) +static void notifiers_decrement(struct hmm *hmm) { - struct hmm *hmm = mm_get_hmm(mm); - struct hmm_mirror *mirror; - struct hmm_range *range; - - /* Report this HMM as dying. */ - hmm->dead = true; + unsigned long flags; - /* Wake-up everyone waiting on any range. */ - mutex_lock(&hmm->lock); - list_for_each_entry(range, &hmm->ranges, list) { - range->valid = false; - } - wake_up_all(&hmm->wq); - mutex_unlock(&hmm->lock); + spin_lock_irqsave(&hmm->ranges_lock, flags); + hmm->notifiers--; + if (!hmm->notifiers) { + struct hmm_range *range; - down_write(&hmm->mirrors_sem); - mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, - list); - while (mirror) { - list_del_init(&mirror->list); - if (mirror->ops->release) { - /* - * Drop mirrors_sem so callback can wait on any pending - * work that might itself trigger mmu_notifier callback - * and thus would deadlock with us. - */ - up_write(&hmm->mirrors_sem); - mirror->ops->release(mirror); - down_write(&hmm->mirrors_sem); + list_for_each_entry(range, &hmm->ranges, list) { + if (range->valid) + continue; + range->valid = true; } - mirror = list_first_entry_or_null(&hmm->mirrors, - struct hmm_mirror, list); + wake_up_all(&hmm->wq); } - up_write(&hmm->mirrors_sem); - - hmm_put(hmm); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); } static int hmm_invalidate_range_start(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); struct hmm_mirror *mirror; struct hmm_update update; struct hmm_range *range; + unsigned long flags; int ret = 0; - VM_BUG_ON(!hmm); + if (!kref_get_unless_zero(&hmm->kref)) + return 0; update.start = nrange->start; update.end = nrange->end; update.event = HMM_UPDATE_INVALIDATE; update.blockable = mmu_notifier_range_blockable(nrange); - if (mmu_notifier_range_blockable(nrange)) - mutex_lock(&hmm->lock); - else if (!mutex_trylock(&hmm->lock)) { - ret = -EAGAIN; - goto out; - } + spin_lock_irqsave(&hmm->ranges_lock, flags); hmm->notifiers++; list_for_each_entry(range, &hmm->ranges, list) { if (update.end < range->start || update.start >= range->end) @@ -211,7 +186,7 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, range->valid = false; } - mutex_unlock(&hmm->lock); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); if (mmu_notifier_range_blockable(nrange)) down_read(&hmm->mirrors_sem); @@ -219,19 +194,23 @@ static int hmm_invalidate_range_start(struct mmu_notifier *mn, ret = -EAGAIN; goto out; } + list_for_each_entry(mirror, &hmm->mirrors, list) { - int ret; + int rc; - ret = mirror->ops->sync_cpu_device_pagetables(mirror, &update); - if (!update.blockable && ret == -EAGAIN) { - up_read(&hmm->mirrors_sem); + rc = mirror->ops->sync_cpu_device_pagetables(mirror, &update); + if (rc) { + if (WARN_ON(update.blockable || rc != -EAGAIN)) + continue; ret = -EAGAIN; - goto out; + break; } } up_read(&hmm->mirrors_sem); out: + if (ret) + notifiers_decrement(hmm); hmm_put(hmm); return ret; } @@ -239,24 +218,12 @@ out: static void hmm_invalidate_range_end(struct mmu_notifier *mn, const struct mmu_notifier_range *nrange) { - struct hmm *hmm = mm_get_hmm(nrange->mm); - - VM_BUG_ON(!hmm); + struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier); - mutex_lock(&hmm->lock); - hmm->notifiers--; - if (!hmm->notifiers) { - struct hmm_range *range; - - list_for_each_entry(range, &hmm->ranges, list) { - if (range->valid) - continue; - range->valid = true; - } - wake_up_all(&hmm->wq); - } - mutex_unlock(&hmm->lock); + if (!kref_get_unless_zero(&hmm->kref)) + return; + notifiers_decrement(hmm); hmm_put(hmm); } @@ -271,14 +238,15 @@ static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { * * @mirror: new mirror struct to register * @mm: mm to register against + * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments * * To start mirroring a process address space, the device driver must register * an HMM mirror struct. - * - * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! */ int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) { + lockdep_assert_held_write(&mm->mmap_sem); + /* Sanity check */ if (!mm || !mirror || !mirror->ops) return -EINVAL; @@ -298,23 +266,17 @@ EXPORT_SYMBOL(hmm_mirror_register); /* * hmm_mirror_unregister() - unregister a mirror * - * @mirror: new mirror struct to register + * @mirror: mirror struct to unregister * * Stop mirroring a process address space, and cleanup. */ void hmm_mirror_unregister(struct hmm_mirror *mirror) { - struct hmm *hmm = READ_ONCE(mirror->hmm); - - if (hmm == NULL) - return; + struct hmm *hmm = mirror->hmm; down_write(&hmm->mirrors_sem); - list_del_init(&mirror->list); - /* To protect us against double unregister ... */ - mirror->hmm = NULL; + list_del(&mirror->list); up_write(&hmm->mirrors_sem); - hmm_put(hmm); } EXPORT_SYMBOL(hmm_mirror_unregister); @@ -330,7 +292,7 @@ struct hmm_vma_walk { static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, bool write_fault, uint64_t *pfn) { - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; + unsigned int flags = FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; @@ -372,7 +334,7 @@ static int hmm_pfns_bad(unsigned long addr, * @fault: should we fault or not ? * @write_fault: write fault ? * @walk: mm_walk structure - * Returns: 0 on success, -EBUSY after page fault, or page fault error + * Return: 0 on success, -EBUSY after page fault, or page fault error * * This function will be called whenever pmd_none() or pte_none() returns true, * or whenever there is no page directory covering the virtual address range. @@ -550,7 +512,7 @@ static int hmm_vma_handle_pmd(struct mm_walk *walk, static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) { - if (pte_none(pte) || !pte_present(pte)) + if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte)) return 0; return pte_write(pte) ? range->flags[HMM_PFN_VALID] | range->flags[HMM_PFN_WRITE] : @@ -788,7 +750,6 @@ again: return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); -#ifdef CONFIG_HUGETLB_PAGE pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); for (i = 0; i < npages; ++i, ++pfn) { hmm_vma_walk->pgmap = get_dev_pagemap(pfn, @@ -804,9 +765,6 @@ again: } hmm_vma_walk->last = end; return 0; -#else - return -EINVAL; -#endif } split_huge_pud(walk->vma, pudp, addr); @@ -909,12 +867,14 @@ static void hmm_pfns_clear(struct hmm_range *range, * Track updates to the CPU page table see include/linux/hmm.h */ int hmm_range_register(struct hmm_range *range, - struct mm_struct *mm, + struct hmm_mirror *mirror, unsigned long start, unsigned long end, unsigned page_shift) { unsigned long mask = ((1UL << page_shift) - 1UL); + struct hmm *hmm = mirror->hmm; + unsigned long flags; range->valid = false; range->hmm = NULL; @@ -928,28 +888,24 @@ int hmm_range_register(struct hmm_range *range, range->start = start; range->end = end; - range->hmm = hmm_get_or_create(mm); - if (!range->hmm) - return -EFAULT; - - /* Check if hmm_mm_destroy() was call. */ - if (range->hmm->mm == NULL || range->hmm->dead) { - hmm_put(range->hmm); + /* Prevent hmm_release() from running while the range is valid */ + if (!mmget_not_zero(hmm->mm)) return -EFAULT; - } - /* Initialize range to track CPU page table update */ - mutex_lock(&range->hmm->lock); + /* Initialize range to track CPU page table updates. */ + spin_lock_irqsave(&hmm->ranges_lock, flags); - list_add_rcu(&range->list, &range->hmm->ranges); + range->hmm = hmm; + kref_get(&hmm->kref); + list_add(&range->list, &hmm->ranges); /* * If there are any concurrent notifiers we have to wait for them for * the range to be valid (see hmm_range_wait_until_valid()). */ - if (!range->hmm->notifiers) + if (!hmm->notifiers) range->valid = true; - mutex_unlock(&range->hmm->lock); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); return 0; } @@ -964,25 +920,31 @@ EXPORT_SYMBOL(hmm_range_register); */ void hmm_range_unregister(struct hmm_range *range) { - /* Sanity check this really should not happen. */ - if (range->hmm == NULL || range->end <= range->start) - return; + struct hmm *hmm = range->hmm; + unsigned long flags; - mutex_lock(&range->hmm->lock); - list_del_rcu(&range->list); - mutex_unlock(&range->hmm->lock); + spin_lock_irqsave(&hmm->ranges_lock, flags); + list_del_init(&range->list); + spin_unlock_irqrestore(&hmm->ranges_lock, flags); /* Drop reference taken by hmm_range_register() */ + mmput(hmm->mm); + hmm_put(hmm); + + /* + * The range is now invalid and the ref on the hmm is dropped, so + * poison the pointer. Leave other fields in place, for the caller's + * use. + */ range->valid = false; - hmm_put(range->hmm); - range->hmm = NULL; + memset(&range->hmm, POISON_INUSE, sizeof(range->hmm)); } EXPORT_SYMBOL(hmm_range_unregister); /* * hmm_range_snapshot() - snapshot CPU page table for a range * @range: range - * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid + * Return: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid * permission (for instance asking for write and range is read only), * -EAGAIN if you need to retry, -EFAULT invalid (ie either no valid * vma or it is illegal to access that range), number of valid pages @@ -1001,10 +963,7 @@ long hmm_range_snapshot(struct hmm_range *range) struct vm_area_struct *vma; struct mm_walk mm_walk; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; - + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. */ if (!range->valid) @@ -1015,9 +974,8 @@ long hmm_range_snapshot(struct hmm_range *range) return -EFAULT; if (is_vm_hugetlb_page(vma)) { - struct hstate *h = hstate_vma(vma); - - if (huge_page_shift(h) != range->page_shift && + if (huge_page_shift(hstate_vma(vma)) != + range->page_shift && range->page_shift != PAGE_SHIFT) return -EINVAL; } else { @@ -1066,7 +1024,7 @@ EXPORT_SYMBOL(hmm_range_snapshot); * hmm_range_fault() - try to fault some address in a virtual address range * @range: range being faulted * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of valid pages in range->pfns[] (from range start + * Return: number of valid pages in range->pfns[] (from range start * address). This may be zero. If the return value is negative, * then one of the following values may be returned: * @@ -1100,9 +1058,7 @@ long hmm_range_fault(struct hmm_range *range, bool block) struct mm_walk mm_walk; int ret; - /* Check if hmm_mm_destroy() was call. */ - if (hmm->mm == NULL || hmm->dead) - return -EFAULT; + lockdep_assert_held(&hmm->mm->mmap_sem); do { /* If range is no longer valid force retry. */ @@ -1184,7 +1140,7 @@ EXPORT_SYMBOL(hmm_range_fault); * @device: device against to dma map page to * @daddrs: dma address of mapped pages * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) - * Returns: number of pages mapped on success, -EAGAIN if mmap_sem have been + * Return: number of pages mapped on success, -EAGAIN if mmap_sem have been * drop and you need to try again, some other error value otherwise * * Note same usage pattern as hmm_range_fault(). @@ -1272,7 +1228,7 @@ EXPORT_SYMBOL(hmm_range_dma_map); * @device: device against which dma map was done * @daddrs: dma address of mapped pages * @dirty: dirty page if it had the write flag set - * Returns: number of page unmapped on success, -EINVAL otherwise + * Return: number of page unmapped on success, -EINVAL otherwise * * Note that caller MUST abide by mmu notifier or use HMM mirror and abide * to the sync_cpu_device_pagetables() callback so that it is safe here to @@ -1328,284 +1284,3 @@ long hmm_range_dma_unmap(struct hmm_range *range, return cpages; } EXPORT_SYMBOL(hmm_range_dma_unmap); -#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ - - -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, - unsigned long addr) -{ - struct page *page; - - page = alloc_page_vma(GFP_HIGHUSER, vma, addr); - if (!page) - return NULL; - lock_page(page); - return page; -} -EXPORT_SYMBOL(hmm_vma_alloc_locked_page); - - -static void hmm_devmem_ref_release(struct percpu_ref *ref) -{ - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); - complete(&devmem->completion); -} - -static void hmm_devmem_ref_exit(struct percpu_ref *ref) -{ - struct hmm_devmem *devmem; - - devmem = container_of(ref, struct hmm_devmem, ref); - wait_for_completion(&devmem->completion); - percpu_ref_exit(ref); -} - -static void hmm_devmem_ref_kill(struct percpu_ref *ref) -{ - percpu_ref_kill(ref); -} - -static vm_fault_t hmm_devmem_fault(struct vm_area_struct *vma, - unsigned long addr, - const struct page *page, - unsigned int flags, - pmd_t *pmdp) -{ - struct hmm_devmem *devmem = page->pgmap->data; - - return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); -} - -static void hmm_devmem_free(struct page *page, void *data) -{ - struct hmm_devmem *devmem = data; - - page->mapping = NULL; - - devmem->ops->free(devmem, page); -} - -/* - * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory - * - * @ops: memory event device driver callback (see struct hmm_devmem_ops) - * @device: device struct to bind the resource too - * @size: size in bytes of the device memory to add - * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise - * - * This function first finds an empty range of physical address big enough to - * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which - * in turn allocates struct pages. It does not do anything beyond that; all - * events affecting the memory will go through the various callbacks provided - * by hmm_devmem_ops struct. - * - * Device driver should call this function during device initialization and - * is then responsible of memory management. HMM only provides helpers. - */ -struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, - struct device *device, - unsigned long size) -{ - struct hmm_devmem *devmem; - resource_size_t addr; - void *result; - int ret; - - dev_pagemap_get_ops(); - - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); - if (!devmem) - return ERR_PTR(-ENOMEM); - - init_completion(&devmem->completion); - devmem->pfn_first = -1UL; - devmem->pfn_last = -1UL; - devmem->resource = NULL; - devmem->device = device; - devmem->ops = ops; - - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, - 0, GFP_KERNEL); - if (ret) - return ERR_PTR(ret); - - size = ALIGN(size, PA_SECTION_SIZE); - addr = min((unsigned long)iomem_resource.end, - (1UL << MAX_PHYSMEM_BITS) - 1); - addr = addr - size + 1UL; - - /* - * FIXME add a new helper to quickly walk resource tree and find free - * range - * - * FIXME what about ioport_resource resource ? - */ - for (; addr > size && addr >= iomem_resource.start; addr -= size) { - ret = region_intersects(addr, size, 0, IORES_DESC_NONE); - if (ret != REGION_DISJOINT) - continue; - - devmem->resource = devm_request_mem_region(device, addr, size, - dev_name(device)); - if (!devmem->resource) - return ERR_PTR(-ENOMEM); - break; - } - if (!devmem->resource) - return ERR_PTR(-ERANGE); - - devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; - devmem->pfn_last = devmem->pfn_first + - (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; - - devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.altmap_valid = false; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; - - result = devm_memremap_pages(devmem->device, &devmem->pagemap); - if (IS_ERR(result)) - return result; - return devmem; -} -EXPORT_SYMBOL_GPL(hmm_devmem_add); - -struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, - struct device *device, - struct resource *res) -{ - struct hmm_devmem *devmem; - void *result; - int ret; - - if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) - return ERR_PTR(-EINVAL); - - dev_pagemap_get_ops(); - - devmem = devm_kzalloc(device, sizeof(*devmem), GFP_KERNEL); - if (!devmem) - return ERR_PTR(-ENOMEM); - - init_completion(&devmem->completion); - devmem->pfn_first = -1UL; - devmem->pfn_last = -1UL; - devmem->resource = res; - devmem->device = device; - devmem->ops = ops; - - ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, - 0, GFP_KERNEL); - if (ret) - return ERR_PTR(ret); - - devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; - devmem->pfn_last = devmem->pfn_first + - (resource_size(devmem->resource) >> PAGE_SHIFT); - devmem->page_fault = hmm_devmem_fault; - - devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; - devmem->pagemap.res = *devmem->resource; - devmem->pagemap.page_free = hmm_devmem_free; - devmem->pagemap.altmap_valid = false; - devmem->pagemap.ref = &devmem->ref; - devmem->pagemap.data = devmem; - devmem->pagemap.kill = hmm_devmem_ref_kill; - devmem->pagemap.cleanup = hmm_devmem_ref_exit; - - result = devm_memremap_pages(devmem->device, &devmem->pagemap); - if (IS_ERR(result)) - return result; - return devmem; -} -EXPORT_SYMBOL_GPL(hmm_devmem_add_resource); - -/* - * A device driver that wants to handle multiple devices memory through a - * single fake device can use hmm_device to do so. This is purely a helper - * and it is not needed to make use of any HMM functionality. - */ -#define HMM_DEVICE_MAX 256 - -static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); -static DEFINE_SPINLOCK(hmm_device_lock); -static struct class *hmm_device_class; -static dev_t hmm_device_devt; - -static void hmm_device_release(struct device *device) -{ - struct hmm_device *hmm_device; - - hmm_device = container_of(device, struct hmm_device, device); - spin_lock(&hmm_device_lock); - clear_bit(hmm_device->minor, hmm_device_mask); - spin_unlock(&hmm_device_lock); - - kfree(hmm_device); -} - -struct hmm_device *hmm_device_new(void *drvdata) -{ - struct hmm_device *hmm_device; - - hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); - if (!hmm_device) - return ERR_PTR(-ENOMEM); - - spin_lock(&hmm_device_lock); - hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); - if (hmm_device->minor >= HMM_DEVICE_MAX) { - spin_unlock(&hmm_device_lock); - kfree(hmm_device); - return ERR_PTR(-EBUSY); - } - set_bit(hmm_device->minor, hmm_device_mask); - spin_unlock(&hmm_device_lock); - - dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); - hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), - hmm_device->minor); - hmm_device->device.release = hmm_device_release; - dev_set_drvdata(&hmm_device->device, drvdata); - hmm_device->device.class = hmm_device_class; - device_initialize(&hmm_device->device); - - return hmm_device; -} -EXPORT_SYMBOL(hmm_device_new); - -void hmm_device_put(struct hmm_device *hmm_device) -{ - put_device(&hmm_device->device); -} -EXPORT_SYMBOL(hmm_device_put); - -static int __init hmm_init(void) -{ - int ret; - - ret = alloc_chrdev_region(&hmm_device_devt, 0, - HMM_DEVICE_MAX, - "hmm_device"); - if (ret) - return ret; - - hmm_device_class = class_create(THIS_MODULE, "hmm_device"); - if (IS_ERR(hmm_device_class)) { - unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); - return PTR_ERR(hmm_device_class); - } - return 0; -} - -device_initcall(hmm_init); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 885642c82aaa..1334ede667a8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -63,10 +63,15 @@ struct page *huge_zero_page __read_mostly; bool transparent_hugepage_enabled(struct vm_area_struct *vma) { + /* The addr is used to check if the vma size fits */ + unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE; + + if (!transhuge_vma_suitable(vma, addr)) + return false; if (vma_is_anonymous(vma)) return __transparent_hugepage_enabled(vma); - if (vma_is_shmem(vma) && shmem_huge_enabled(vma)) - return __transparent_hugepage_enabled(vma); + if (vma_is_shmem(vma)) + return shmem_huge_enabled(vma); return false; } @@ -689,7 +694,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) + if (!transhuge_vma_suitable(vma, haddr)) return VM_FAULT_FALLBACK; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; diff --git a/mm/maccess.c b/mm/maccess.c index 482d4d670f19..d065736f6b87 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -6,8 +6,20 @@ #include <linux/mm.h> #include <linux/uaccess.h> +static __always_inline long +probe_read_common(void *dst, const void __user *src, size_t size) +{ + long ret; + + pagefault_disable(); + ret = __copy_from_user_inatomic(dst, src, size); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + /** - * probe_kernel_read(): safely attempt to read from a location + * probe_kernel_read(): safely attempt to read from a kernel-space location * @dst: pointer to the buffer that shall take the data * @src: address to read from * @size: size of the data chunk @@ -30,17 +42,41 @@ long __probe_kernel_read(void *dst, const void *src, size_t size) mm_segment_t old_fs = get_fs(); set_fs(KERNEL_DS); - pagefault_disable(); - ret = __copy_from_user_inatomic(dst, - (__force const void __user *)src, size); - pagefault_enable(); + ret = probe_read_common(dst, (__force const void __user *)src, size); set_fs(old_fs); - return ret ? -EFAULT : 0; + return ret; } EXPORT_SYMBOL_GPL(probe_kernel_read); /** + * probe_user_read(): safely attempt to read from a user-space location + * @dst: pointer to the buffer that shall take the data + * @src: address to read from. This must be a user address. + * @size: size of the data chunk + * + * Safely read from user address @src to the buffer at @dst. If a kernel fault + * happens, handle that and return -EFAULT. + */ + +long __weak probe_user_read(void *dst, const void __user *src, size_t size) + __attribute__((alias("__probe_user_read"))); + +long __probe_user_read(void *dst, const void __user *src, size_t size) +{ + long ret = -EFAULT; + mm_segment_t old_fs = get_fs(); + + set_fs(USER_DS); + if (access_ok(src, size)) + ret = probe_read_common(dst, src, size); + set_fs(old_fs); + + return ret; +} +EXPORT_SYMBOL_GPL(probe_user_read); + +/** * probe_kernel_write(): safely attempt to write to a location * @dst: address to write to * @src: pointer to the data that shall be written @@ -67,6 +103,7 @@ long __probe_kernel_write(void *dst, const void *src, size_t size) } EXPORT_SYMBOL_GPL(probe_kernel_write); + /** * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address. * @dst: Destination address, in kernel space. This buffer must be at @@ -106,3 +143,76 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) return ret ? -EFAULT : src - unsafe_addr; } + +/** + * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user + * address. + * @dst: Destination address, in kernel space. This buffer must be at + * least @count bytes long. + * @unsafe_addr: Unsafe user address. + * @count: Maximum number of bytes to copy, including the trailing NUL. + * + * Copies a NUL-terminated string from unsafe user address to kernel buffer. + * + * On success, returns the length of the string INCLUDING the trailing NUL. + * + * If access fails, returns -EFAULT (some data may have been copied + * and the trailing NUL added). + * + * If @count is smaller than the length of the string, copies @count-1 bytes, + * sets the last byte of @dst buffer to NUL and returns @count. + */ +long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr, + long count) +{ + mm_segment_t old_fs = get_fs(); + long ret; + + if (unlikely(count <= 0)) + return 0; + + set_fs(USER_DS); + pagefault_disable(); + ret = strncpy_from_user(dst, unsafe_addr, count); + pagefault_enable(); + set_fs(old_fs); + + if (ret >= count) { + ret = count; + dst[ret - 1] = '\0'; + } else if (ret > 0) { + ret++; + } + + return ret; +} + +/** + * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL. + * @unsafe_addr: The string to measure. + * @count: Maximum count (including NUL) + * + * Get the size of a NUL-terminated string in user space without pagefault. + * + * Returns the size of the string INCLUDING the terminating NUL. + * + * If the string is too long, returns a number larger than @count. User + * has to check the return value against "> count". + * On exception (or invalid count), returns 0. + * + * Unlike strnlen_user, this can be used from IRQ handler etc. because + * it disables pagefaults. + */ +long strnlen_unsafe_user(const void __user *unsafe_addr, long count) +{ + mm_segment_t old_fs = get_fs(); + int ret; + + set_fs(USER_DS); + pagefault_disable(); + ret = strnlen_user(unsafe_addr, count); + pagefault_enable(); + set_fs(old_fs); + + return ret; +} diff --git a/mm/madvise.c b/mm/madvise.c index 628022e674a7..968df3aa069f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -354,7 +354,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; } - page = _vm_normal_page(vma, addr, ptent, true); + page = vm_normal_page(vma, addr, ptent); if (!page) continue; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f05735b02d3..cdbb7a84cb6e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -695,12 +695,15 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) if (mem_cgroup_disabled()) return; - __this_cpu_add(memcg->vmstats_local->stat[idx], val); - x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { struct mem_cgroup *mi; + /* + * Batch local counters to keep them in sync with + * the hierarchical ones. + */ + __this_cpu_add(memcg->vmstats_local->stat[idx], x); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) atomic_long_add(x, &mi->vmstats[idx]); x = 0; @@ -749,13 +752,15 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update memcg */ __mod_memcg_state(memcg, idx, val); - /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], val); - x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { struct mem_cgroup_per_node *pi; + /* + * Batch local counters to keep them in sync with + * the hierarchical ones. + */ + __this_cpu_add(pn->lruvec_stat_local->count[idx], x); for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) atomic_long_add(x, &pi->lruvec_stat[idx]); x = 0; @@ -777,12 +782,15 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled()) return; - __this_cpu_add(memcg->vmstats_local->events[idx], count); - x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]); if (unlikely(x > MEMCG_CHARGE_BATCH)) { struct mem_cgroup *mi; + /* + * Batch local counters to keep them in sync with + * the hierarchical ones. + */ + __this_cpu_add(memcg->vmstats_local->events[idx], x); for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) atomic_long_add(x, &mi->vmevents[idx]); x = 0; @@ -4908,7 +4916,7 @@ enum mc_target_type { static struct page *mc_handle_present_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { - struct page *page = _vm_normal_page(vma, addr, ptent, true); + struct page *page = vm_normal_page(vma, addr, ptent); if (!page || !page_mapped(page)) return NULL; @@ -5109,8 +5117,8 @@ out: * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a * target for charge migration. if @target is not NULL, the entry is stored * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC - * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru). + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE + * (so ZONE_DEVICE page and thus not on the lru). * For now we such page is charge like a regular page would be as for all * intent and purposes it is just special memory taking the place of a * regular page. @@ -5144,8 +5152,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, */ if (page->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; - if (is_device_private_page(page) || - is_device_public_page(page)) + if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; if (target) target->page = page; @@ -5216,8 +5223,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, if (ptl) { /* * Note their can not be MC_TARGET_DEVICE for now as we do not - * support transparent huge page with MEMORY_DEVICE_PUBLIC or - * MEMORY_DEVICE_PRIVATE but this might change. + * support transparent huge page with MEMORY_DEVICE_PRIVATE but + * this might change. */ if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7e08cbf3ba49..7ef849da8278 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1177,16 +1177,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, goto unlock; } - switch (pgmap->type) { - case MEMORY_DEVICE_PRIVATE: - case MEMORY_DEVICE_PUBLIC: + if (pgmap->type == MEMORY_DEVICE_PRIVATE) { /* * TODO: Handle HMM pages which may need coordination * with device-side memory. */ goto unlock; - default: - break; } /* diff --git a/mm/memory.c b/mm/memory.c index 53bd59579861..e2bb51b6242e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -571,8 +571,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, - pte_t pte, bool with_public_device) +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { unsigned long pfn = pte_pfn(pte); @@ -585,29 +585,6 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; if (is_zero_pfn(pfn)) return NULL; - - /* - * Device public pages are special pages (they are ZONE_DEVICE - * pages but different from persistent memory). They behave - * allmost like normal pages. The difference is that they are - * not on the lru and thus should never be involve with any- - * thing that involve lru manipulation (mlock, numa balancing, - * ...). - * - * This is why we still want to return NULL for such page from - * vm_normal_page() so that we do not have to special case all - * call site of vm_normal_page(). - */ - if (likely(pfn <= highest_memmap_pfn)) { - struct page *page = pfn_to_page(pfn); - - if (is_device_public_page(page)) { - if (with_public_device) - return page; - return NULL; - } - } - if (pte_devmap(pte)) return NULL; @@ -797,17 +774,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, rss[mm_counter(page)]++; } else if (pte_devmap(pte)) { page = pte_page(pte); - - /* - * Cache coherent device memory behave like regular page and - * not like persistent memory page. For more informations see - * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h - */ - if (is_device_public_page(page)) { - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; - } } out_set_pte: @@ -1063,7 +1029,7 @@ again: if (pte_present(ptent)) { struct page *page; - page = _vm_normal_page(vma, addr, ptent, true); + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -2777,13 +2743,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_private_entry(entry)) { - /* - * For un-addressable device memory we call the pgmap - * fault handler callback. The callback must migrate - * the page back to some CPU accessible page. - */ - ret = device_private_entry_fault(vma, vmf->address, entry, - vmf->flags, vmf->pmd); + vmf->page = device_private_entry_to_page(entry); + ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { @@ -3201,19 +3162,6 @@ map_pte: } #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE - -#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1) -static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, - unsigned long haddr) -{ - if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) != - (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK)) - return false; - if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) - return false; - return true; -} - static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e096c987d261..2a9bbddb0e55 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -166,9 +166,10 @@ void put_page_bootmem(struct page *page) #ifndef CONFIG_SPARSEMEM_VMEMMAP static void register_page_bootmem_info_section(unsigned long start_pfn) { - unsigned long *usemap, mapsize, section_nr, i; + unsigned long mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; + struct mem_section_usage *usage; section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -188,10 +189,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); - usemap = ms->pageblock_flags; - page = virt_to_page(usemap); + usage = ms->usage; + page = virt_to_page(usage); - mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); @@ -200,9 +201,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) #else /* CONFIG_SPARSEMEM_VMEMMAP */ static void register_page_bootmem_info_section(unsigned long start_pfn) { - unsigned long *usemap, mapsize, section_nr, i; + unsigned long mapsize, section_nr, i; struct mem_section *ms; struct page *page, *memmap; + struct mem_section_usage *usage; section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); @@ -211,10 +213,10 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - usemap = ms->pageblock_flags; - page = virt_to_page(usemap); + usage = ms->usage; + page = virt_to_page(usage); - mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; + mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT; for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, MIX_SECTION_INFO); @@ -250,22 +252,31 @@ void __init register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static int __meminit __add_section(int nid, unsigned long phys_start_pfn, - struct vmem_altmap *altmap, bool want_memblock) +static int check_pfn_span(unsigned long pfn, unsigned long nr_pages, + const char *reason) { - int ret; - - if (pfn_valid(phys_start_pfn)) - return -EEXIST; - - ret = sparse_add_one_section(nid, phys_start_pfn, altmap); - if (ret < 0) - return ret; - - if (!want_memblock) - return 0; - - return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn)); + /* + * Disallow all operations smaller than a sub-section and only + * allow operations smaller than a section for + * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range() + * enforces a larger memory_block_size_bytes() granularity for + * memory that will be marked online, so this check should only + * fire for direct arch_{add,remove}_memory() users outside of + * add_memory_resource(). + */ + unsigned long min_align; + + if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) + min_align = PAGES_PER_SUBSECTION; + else + min_align = PAGES_PER_SECTION; + if (!IS_ALIGNED(pfn, min_align) + || !IS_ALIGNED(nr_pages, min_align)) { + WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n", + reason, pfn, pfn + nr_pages - 1); + return -EINVAL; + } + return 0; } /* @@ -274,62 +285,54 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, * call this function after deciding the zone to which to * add the new pages. */ -int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, struct mhp_restrictions *restrictions) +int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages, + struct mhp_restrictions *restrictions) { - unsigned long i; - int err = 0; - int start_sec, end_sec; + int err; + unsigned long nr, start_sec, end_sec; struct vmem_altmap *altmap = restrictions->altmap; - /* during initialize mem_map, align hot-added range to section */ - start_sec = pfn_to_section_nr(phys_start_pfn); - end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - if (altmap) { /* * Validate altmap is within bounds of the total request */ - if (altmap->base_pfn != phys_start_pfn + if (altmap->base_pfn != pfn || vmem_altmap_offset(altmap) > nr_pages) { pr_warn_once("memory add fail, invalid altmap\n"); - err = -EINVAL; - goto out; + return -EINVAL; } altmap->alloc = 0; } - for (i = start_sec; i <= end_sec; i++) { - err = __add_section(nid, section_nr_to_pfn(i), altmap, - restrictions->flags & MHP_MEMBLOCK_API); + err = check_pfn_span(pfn, nr_pages, "add"); + if (err) + return err; - /* - * EEXIST is finally dealt with by ioresource collision - * check. see add_memory() => register_memory_resource() - * Warning will be printed if there is collision. - */ - if (err && (err != -EEXIST)) + start_sec = pfn_to_section_nr(pfn); + end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + for (nr = start_sec; nr <= end_sec; nr++) { + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + err = sparse_add_section(nid, pfn, pfns, altmap); + if (err) break; - err = 0; + pfn += pfns; + nr_pages -= pfns; cond_resched(); } vmemmap_populate_print_last(); -out: return err; } -#ifdef CONFIG_MEMORY_HOTREMOVE /* find the smallest valid pfn in the range [start_pfn, end_pfn) */ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - struct mem_section *ms; - - for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(start_pfn); - - if (unlikely(!valid_section(ms))) + for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_valid(start_pfn))) continue; if (unlikely(pfn_to_nid(start_pfn) != nid)) @@ -349,15 +352,12 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - struct mem_section *ms; unsigned long pfn; /* pfn is the end pfn of a memory section. */ pfn = end_pfn - 1; - for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); - - if (unlikely(!valid_section(ms))) + for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_valid(pfn))) continue; if (unlikely(pfn_to_nid(pfn) != nid)) @@ -379,7 +379,6 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ unsigned long zone_end_pfn = z; unsigned long pfn; - struct mem_section *ms; int nid = zone_to_nid(zone); zone_span_writelock(zone); @@ -416,17 +415,15 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, * it check the zone has only hole or not. */ pfn = zone_start_pfn; - for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); - - if (unlikely(!valid_section(ms))) + for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_valid(pfn))) continue; if (page_zone(pfn_to_page(pfn)) != zone) continue; - /* If the section is current section, it continues the loop */ - if (start_pfn == pfn) + /* Skip range to be removed */ + if (pfn >= start_pfn && pfn < end_pfn) continue; /* If we find valid section, we have nothing to do */ @@ -447,7 +444,6 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ unsigned long pgdat_end_pfn = p; unsigned long pfn; - struct mem_section *ms; int nid = pgdat->node_id; if (pgdat_start_pfn == start_pfn) { @@ -484,17 +480,15 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, * has only hole or not. */ pfn = pgdat_start_pfn; - for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) { - ms = __pfn_to_section(pfn); - - if (unlikely(!valid_section(ms))) + for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SUBSECTION) { + if (unlikely(!pfn_valid(pfn))) continue; if (pfn_to_nid(pfn) != nid) continue; - /* If the section is current section, it continues the loop */ - if (start_pfn == pfn) + /* Skip range to be removed */ + if (pfn >= start_pfn && pfn < end_pfn) continue; /* If we find valid section, we have nothing to do */ @@ -506,10 +500,10 @@ static void shrink_pgdat_span(struct pglist_data *pgdat, pgdat->node_spanned_pages = 0; } -static void __remove_zone(struct zone *zone, unsigned long start_pfn) +static void __remove_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages) { struct pglist_data *pgdat = zone->zone_pgdat; - int nr_pages = PAGES_PER_SECTION; unsigned long flags; pgdat_resize_lock(zone->zone_pgdat, &flags); @@ -518,29 +512,23 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static void __remove_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, - struct vmem_altmap *altmap) +static void __remove_section(struct zone *zone, unsigned long pfn, + unsigned long nr_pages, unsigned long map_offset, + struct vmem_altmap *altmap) { - unsigned long start_pfn; - int scn_nr; + struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn)); if (WARN_ON_ONCE(!valid_section(ms))) return; - unregister_memory_section(ms); - - scn_nr = __section_nr(ms); - start_pfn = section_nr_to_pfn((unsigned long)scn_nr); - __remove_zone(zone, start_pfn); - - sparse_remove_one_section(zone, ms, map_offset, altmap); + __remove_zone(zone, pfn, nr_pages); + sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap); } /** * __remove_pages() - remove sections of pages from a zone * @zone: zone from which pages need to be removed - * @phys_start_pfn: starting pageframe (must be aligned to start of a section) + * @pfn: starting pageframe (must be aligned to start of a section) * @nr_pages: number of pages to remove (must be multiple of section size) * @altmap: alternative device page map or %NULL if default memmap is used * @@ -549,40 +537,35 @@ static void __remove_section(struct zone *zone, struct mem_section *ms, * sure that pages are marked reserved and zones are adjust properly by * calling offline_pages(). */ -void __remove_pages(struct zone *zone, unsigned long phys_start_pfn, +void __remove_pages(struct zone *zone, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - unsigned long i; unsigned long map_offset = 0; - int sections_to_remove; + unsigned long nr, start_sec, end_sec; - /* In the ZONE_DEVICE case device driver owns the memory region */ - if (is_dev_zone(zone)) { - if (altmap) - map_offset = vmem_altmap_offset(altmap); - } + map_offset = vmem_altmap_offset(altmap); clear_zone_contiguous(zone); - /* - * We can only remove entire sections - */ - BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); - BUG_ON(nr_pages % PAGES_PER_SECTION); + if (check_pfn_span(pfn, nr_pages, "remove")) + return; - sections_to_remove = nr_pages / PAGES_PER_SECTION; - for (i = 0; i < sections_to_remove; i++) { - unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + start_sec = pfn_to_section_nr(pfn); + end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + for (nr = start_sec; nr <= end_sec; nr++) { + unsigned long pfns; cond_resched(); - __remove_section(zone, __pfn_to_section(pfn), map_offset, - altmap); + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + __remove_section(zone, pfn, pfns, map_offset, altmap); + pfn += pfns; + nr_pages -= pfns; map_offset = 0; } set_zone_contiguous(zone); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ int set_online_page_callback(online_page_callback_t callback) { @@ -1051,16 +1034,11 @@ int try_online_node(int nid) static int check_hotplug_memory_range(u64 start, u64 size) { - unsigned long block_sz = memory_block_size_bytes(); - u64 block_nr_pages = block_sz >> PAGE_SHIFT; - u64 nr_pages = size >> PAGE_SHIFT; - u64 start_pfn = PFN_DOWN(start); - /* memory range must be block size aligned */ - if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) || - !IS_ALIGNED(nr_pages, block_nr_pages)) { + if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) || + !IS_ALIGNED(size, memory_block_size_bytes())) { pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx", - block_sz, start, size); + memory_block_size_bytes(), start, size); return -EINVAL; } @@ -1080,9 +1058,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) */ int __ref add_memory_resource(int nid, struct resource *res) { - struct mhp_restrictions restrictions = { - .flags = MHP_MEMBLOCK_API, - }; + struct mhp_restrictions restrictions = {}; u64 start, size; bool new_node = false; int ret; @@ -1114,6 +1090,13 @@ int __ref add_memory_resource(int nid, struct resource *res) if (ret < 0) goto error; + /* create memory block devices after memory was added */ + ret = create_memory_block_devices(start, size); + if (ret) { + arch_remove_memory(nid, start, size, NULL); + goto error; + } + if (new_node) { /* If sysfs file of new node can't be created, cpu on the node * can't be hot-added. There is no rollback way now. @@ -1137,8 +1120,7 @@ int __ref add_memory_resource(int nid, struct resource *res) /* online pages if requested */ if (memhp_auto_online) - walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), - NULL, online_memory_block); + walk_memory_blocks(start, size, NULL, online_memory_block); return ret; error: @@ -1673,58 +1655,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ - -/** - * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) - * @start_pfn: start pfn of the memory range - * @end_pfn: end pfn of the memory range - * @arg: argument passed to func - * @func: callback for each memory section walked - * - * This function walks through all present mem sections in range - * [start_pfn, end_pfn) and call func on each mem section. - * - * Returns the return value of func. - */ -int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, - void *arg, int (*func)(struct memory_block *, void *)) -{ - struct memory_block *mem = NULL; - struct mem_section *section; - unsigned long pfn, section_nr; - int ret; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - section_nr = pfn_to_section_nr(pfn); - if (!present_section_nr(section_nr)) - continue; - - section = __nr_to_section(section_nr); - /* same memblock? */ - if (mem) - if ((section_nr >= mem->start_section_nr) && - (section_nr <= mem->end_section_nr)) - continue; - - mem = find_memory_block_hinted(section, mem); - if (!mem) - continue; - - ret = func(mem, arg); - if (ret) { - kobject_put(&mem->dev.kobj); - return ret; - } - } - - if (mem) - kobject_put(&mem->dev.kobj); - - return 0; -} -#ifdef CONFIG_MEMORY_HOTREMOVE static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); @@ -1736,9 +1667,10 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1; pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n", &beginpa, &endpa); - } - return ret; + return -EBUSY; + } + return 0; } static int check_cpu_on_node(pg_data_t *pgdat) @@ -1821,19 +1753,9 @@ static void __release_memory_resource(resource_size_t start, } } -/** - * remove_memory - * @nid: the node ID - * @start: physical address of the region to remove - * @size: size of the region to remove - * - * NOTE: The caller must call lock_device_hotplug() to serialize hotplug - * and online/offline operations before this call, as required by - * try_offline_node(). - */ -void __ref __remove_memory(int nid, u64 start, u64 size) +static int __ref try_remove_memory(int nid, u64 start, u64 size) { - int ret; + int rc = 0; BUG_ON(check_hotplug_memory_range(start, size)); @@ -1841,32 +1763,65 @@ void __ref __remove_memory(int nid, u64 start, u64 size) /* * All memory blocks must be offlined before removing memory. Check - * whether all memory blocks in question are offline and trigger a BUG() + * whether all memory blocks in question are offline and return error * if this is not the case. */ - ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, - check_memblock_offlined_cb); - if (ret) - BUG(); + rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb); + if (rc) + goto done; /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); memblock_free(start, size); memblock_remove(start, size); + /* remove memory block devices before removing memory */ + remove_memory_block_devices(start, size); + arch_remove_memory(nid, start, size, NULL); __release_memory_resource(start, size); try_offline_node(nid); +done: mem_hotplug_done(); + return rc; } -void remove_memory(int nid, u64 start, u64 size) +/** + * remove_memory + * @nid: the node ID + * @start: physical address of the region to remove + * @size: size of the region to remove + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call, as required by + * try_offline_node(). + */ +void __remove_memory(int nid, u64 start, u64 size) { + + /* + * trigger BUG() is some memory is not offlined prior to calling this + * function + */ + if (try_remove_memory(nid, start, size)) + BUG(); +} + +/* + * Remove memory if every memory block is offline, otherwise return -EBUSY is + * some memory is not offline + */ +int remove_memory(int nid, u64 start, u64 size) +{ + int rc; + lock_device_hotplug(); - __remove_memory(nid, start, size); + rc = try_remove_memory(nid, start, size); unlock_device_hotplug(); + + return rc; } EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index fdcb73536319..f48693f75b37 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2098,6 +2098,7 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, out: return page; } +EXPORT_SYMBOL(alloc_pages_vma); /** * alloc_pages_current - Allocate pages. diff --git a/mm/migrate.c b/mm/migrate.c index e9594bc0d406..8992741f10aa 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -246,8 +246,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (is_device_private_page(new)) { entry = make_device_private_entry(new, pte_write(pte)); pte = swp_entry_to_pte(entry); - } else if (is_device_public_page(new)) { - pte = pte_mkdevmap(pte); } } @@ -381,7 +379,6 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) * ZONE_DEVICE pages. */ expected_count += is_device_private_page(page); - expected_count += is_device_public_page(page); if (mapping) expected_count += hpage_nr_pages(page) + page_has_private(page); @@ -397,8 +394,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page) * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ int migrate_page_move_mapping(struct address_space *mapping, - struct page *newpage, struct page *page, enum migrate_mode mode, - int extra_count) + struct page *newpage, struct page *page, int extra_count) { XA_STATE(xas, &mapping->i_pages, page_index(page)); struct zone *oldzone, *newzone; @@ -684,7 +680,7 @@ int migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, 0); if (rc != MIGRATEPAGE_SUCCESS) return rc; @@ -783,7 +779,7 @@ recheck_buffers: } } - rc = migrate_page_move_mapping(mapping, newpage, page, mode, 0); + rc = migrate_page_move_mapping(mapping, newpage, page, 0); if (rc != MIGRATEPAGE_SUCCESS) goto unlock_buffers; @@ -994,10 +990,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, if (!PageMappingFlags(page)) page->mapping = NULL; - if (unlikely(is_zone_device_page(newpage))) { - if (is_device_public_page(newpage)) - flush_dcache_page(newpage); - } else + if (likely(!is_zone_device_page(newpage))) flush_dcache_page(newpage); } @@ -2265,7 +2258,7 @@ again: pfn = 0; goto next; } - page = _vm_normal_page(migrate->vma, addr, pte, true); + page = vm_normal_page(migrate->vma, addr, pte); mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0; } @@ -2406,16 +2399,7 @@ static bool migrate_vma_check_page(struct page *page) * FIXME proper solution is to rework migration_entry_wait() so * it does not need to take a reference on page. */ - if (is_device_private_page(page)) - return true; - - /* - * Only allow device public page to be migrated and account for - * the extra reference count imply by ZONE_DEVICE pages. - */ - if (!is_device_public_page(page)) - return false; - extra++; + return is_device_private_page(page); } /* For file back page */ @@ -2665,11 +2649,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE); entry = swp_entry_to_pte(swp_entry); - } else if (is_device_public_page(page)) { - entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot))); - if (vma->vm_flags & VM_WRITE) - entry = pte_mkwrite(pte_mkdirty(entry)); - entry = pte_mkdevmap(entry); } } else { entry = mk_pte(page, vma->vm_page_prot); @@ -2789,7 +2768,7 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - } else if (!is_device_public_page(newpage)) { + } else { /* * Other types of ZONE_DEVICE page are not * supported. diff --git a/mm/nommu.c b/mm/nommu.c index eb3e2e558da1..fed1b6e9c89b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1261,7 +1261,9 @@ unsigned long do_mmap(struct file *file, add_nommu_region(region); /* clear anonymous mappings that don't ask for uninitialized data */ - if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) + if (!vma->vm_file && + (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) || + !(flags & MAP_UNINITIALIZED))) memset((void *)region->vm_start, 0, region->vm_end - region->vm_start); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dbd0d5cbbcbb..272c6de1bf4e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -450,7 +450,7 @@ static inline unsigned long *get_pageblock_bitmap(struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; + return section_to_usemap(__pfn_to_section(pfn)); #else return page_zone(page)->pageblock_flags; #endif /* CONFIG_SPARSEMEM */ @@ -4102,7 +4102,6 @@ static int __perform_reclaim(gfp_t gfp_mask, unsigned int order, const struct alloc_context *ac) { - struct reclaim_state reclaim_state; int progress; unsigned int noreclaim_flag; unsigned long pflags; @@ -4114,13 +4113,10 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, psi_memstall_enter(&pflags); fs_reclaim_acquire(gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); - reclaim_state.reclaimed_slab = 0; - current->reclaim_state = &reclaim_state; progress = try_to_free_pages(ac->zonelist, order, gfp_mask, ac->nodemask); - current->reclaim_state = NULL; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(gfp_mask); psi_memstall_leave(&pflags); @@ -5925,11 +5921,12 @@ void __ref memmap_init_zone_device(struct zone *zone, { unsigned long pfn, end_pfn = start_pfn + size; struct pglist_data *pgdat = zone->zone_pgdat; + struct vmem_altmap *altmap = pgmap_altmap(pgmap); unsigned long zone_idx = zone_idx(zone); unsigned long start = jiffies; int nid = pgdat->node_id; - if (WARN_ON_ONCE(!pgmap || !is_dev_zone(zone))) + if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) return; /* @@ -5937,9 +5934,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * of the pages reserved for the memmap, so we can just jump to * the end of that region and start processing the device pages. */ - if (pgmap->altmap_valid) { - struct vmem_altmap *altmap = &pgmap->altmap; - + if (altmap) { start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap); size = end_pfn - start_pfn; } @@ -5959,12 +5954,12 @@ void __ref memmap_init_zone_device(struct zone *zone, __SetPageReserved(page); /* - * ZONE_DEVICE pages union ->lru with a ->pgmap back - * pointer and hmm_data. It is a bug if a ZONE_DEVICE - * page is ever freed or placed on a driver-private list. + * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer + * and zone_device_data. It is a bug if a ZONE_DEVICE page is + * ever freed or placed on a driver-private list. */ page->pgmap = pgmap; - page->hmm_data = 0; + page->zone_device_data = NULL; /* * Mark the block movable so that blocks are reserved for @@ -5979,7 +5974,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * pfn out of zone. * * Please note that MEMMAP_HOTPLUG path doesn't clear memmap - * because this is done early in sparse_add_one_section + * because this is done early in section_activate() */ if (!(pfn & (pageblock_nr_pages - 1))) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); @@ -7356,12 +7351,18 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) (u64)zone_movable_pfn[i] << PAGE_SHIFT); } - /* Print out the early node map */ + /* + * Print out the early node map, and initialize the + * subsection-map relative to active online memory ranges to + * enable future "sub-section" extensions of the memory map. + */ pr_info("Early memory node ranges\n"); - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) { pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid, (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1); + subsection_map_init(start_pfn, end_pfn - start_pfn); + } /* Initialise every node */ mminit_verify_pageflags_layout(); diff --git a/mm/shmem.c b/mm/shmem.c index f4dce9c8670d..626d8c74b973 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -400,7 +400,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly; -#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) +#if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { if (!strcmp(str, "never")) @@ -417,7 +417,9 @@ static int shmem_parse_huge(const char *str) return SHMEM_HUGE_FORCE; return -EINVAL; } +#endif +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) { switch (huge) { @@ -3775,10 +3777,6 @@ int __init shmem_init(void) { int error; - /* If rootfs called this, don't re-init */ - if (shmem_inode_cachep) - return 0; - shmem_init_inodecache(); error = register_filesystem(&shmem_fs_type); @@ -3872,6 +3870,9 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) loff_t i_size; pgoff_t off; + if ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) + return false; if (shmem_huge == SHMEM_HUGE_FORCE) return true; if (shmem_huge == SHMEM_HUGE_DENY) diff --git a/mm/slab_common.c b/mm/slab_common.c index 6c49dbb3769e..807490fe217a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1028,7 +1028,8 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, } struct kmem_cache * -kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init; +kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = +{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ }; EXPORT_SYMBOL(kmalloc_caches); /* diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 7fec05796796..200aef686722 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -245,19 +245,26 @@ int __meminit vmemmap_populate_basepages(unsigned long start, return 0; } -struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +struct page * __meminit __populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { unsigned long start; unsigned long end; - struct page *map; - map = pfn_to_page(pnum * PAGES_PER_SECTION); - start = (unsigned long)map; - end = (unsigned long)(map + PAGES_PER_SECTION); + /* + * The minimum granularity of memmap extensions is + * PAGES_PER_SUBSECTION as allocations are tracked in the + * 'subsection_map' bitmap of the section. + */ + end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION); + pfn &= PAGE_SUBSECTION_MASK; + nr_pages = end - pfn; + + start = (unsigned long) pfn_to_page(pfn); + end = start + nr_pages * sizeof(struct page); if (vmemmap_populate(start, end, nid, altmap)) return NULL; - return map; + return pfn_to_page(pfn); } diff --git a/mm/sparse.c b/mm/sparse.c index fd13166949b5..72f010d9bff5 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -83,8 +83,15 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; + /* + * An existing section is possible in the sub-section hotplug + * case. First hot-add instantiates, follow-on hot-add reuses + * the existing section. + * + * The mem_hotplug_lock resolves the apparent race below. + */ if (mem_section[root]) - return -EEXIST; + return 0; section = sparse_index_alloc(nid); if (!section) @@ -102,7 +109,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) #endif #ifdef CONFIG_SPARSEMEM_EXTREME -int __section_nr(struct mem_section* ms) +unsigned long __section_nr(struct mem_section *ms) { unsigned long root_nr; struct mem_section *root = NULL; @@ -121,9 +128,9 @@ int __section_nr(struct mem_section* ms) return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } #else -int __section_nr(struct mem_section* ms) +unsigned long __section_nr(struct mem_section *ms) { - return (int)(ms - mem_section[0]); + return (unsigned long)(ms - mem_section[0]); } #endif @@ -178,10 +185,10 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, * Keeping track of this gives us an easy way to break out of * those loops early. */ -int __highest_present_section_nr; +unsigned long __highest_present_section_nr; static void section_mark_present(struct mem_section *ms) { - int section_nr = __section_nr(ms); + unsigned long section_nr = __section_nr(ms); if (section_nr > __highest_present_section_nr) __highest_present_section_nr = section_nr; @@ -189,7 +196,7 @@ static void section_mark_present(struct mem_section *ms) ms->section_mem_map |= SECTION_MARKED_PRESENT; } -static inline int next_present_section_nr(int section_nr) +static inline unsigned long next_present_section_nr(unsigned long section_nr) { do { section_nr++; @@ -210,6 +217,41 @@ static inline unsigned long first_present_section_nr(void) return next_present_section_nr(-1); } +void subsection_mask_set(unsigned long *map, unsigned long pfn, + unsigned long nr_pages) +{ + int idx = subsection_map_index(pfn); + int end = subsection_map_index(pfn + nr_pages - 1); + + bitmap_set(map, idx, end - idx + 1); +} + +void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages) +{ + int end_sec = pfn_to_section_nr(pfn + nr_pages - 1); + unsigned long nr, start_sec = pfn_to_section_nr(pfn); + + if (!nr_pages) + return; + + for (nr = start_sec; nr <= end_sec; nr++) { + struct mem_section *ms; + unsigned long pfns; + + pfns = min(nr_pages, PAGES_PER_SECTION + - (pfn & ~PAGE_SECTION_MASK)); + ms = __nr_to_section(nr); + subsection_mask_set(ms->usage->subsection_map, pfn, pfns); + + pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr, + pfns, subsection_map_index(pfn), + subsection_map_index(pfn + pfns - 1)); + + pfn += pfns; + nr_pages -= pfns; + } +} + /* Record a memory area against a node. */ void __init memory_present(int nid, unsigned long start, unsigned long end) { @@ -288,33 +330,31 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn static void __meminit sparse_init_one_section(struct mem_section *ms, unsigned long pnum, struct page *mem_map, - unsigned long *pageblock_bitmap) + struct mem_section_usage *usage, unsigned long flags) { ms->section_mem_map &= ~SECTION_MAP_MASK; - ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | - SECTION_HAS_MEM_MAP; - ms->pageblock_flags = pageblock_bitmap; + ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) + | SECTION_HAS_MEM_MAP | flags; + ms->usage = usage; } -unsigned long usemap_size(void) +static unsigned long usemap_size(void) { return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); } -#ifdef CONFIG_MEMORY_HOTPLUG -static unsigned long *__kmalloc_section_usemap(void) +size_t mem_section_usage_size(void) { - return kmalloc(usemap_size(), GFP_KERNEL); + return sizeof(struct mem_section_usage) + usemap_size(); } -#endif /* CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTREMOVE -static unsigned long * __init +static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { + struct mem_section_usage *usage; unsigned long goal, limit; - unsigned long *p; int nid; /* * A page may contain usemaps for other sections preventing the @@ -330,15 +370,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: - p = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); - if (!p && limit) { + usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid); + if (!usage && limit) { limit = 0; goto again; } - return p; + return usage; } -static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +static void __init check_usemap_section_nr(int nid, + struct mem_section_usage *usage) { unsigned long usemap_snr, pgdat_snr; static unsigned long old_usemap_snr; @@ -352,7 +393,7 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) old_pgdat_snr = NR_MEM_SECTIONS; } - usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT); + usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT); pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); if (usemap_snr == pgdat_snr) return; @@ -380,14 +421,15 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) usemap_snr, pgdat_snr, nid); } #else -static unsigned long * __init +static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id); } -static void __init check_usemap_section_nr(int nid, unsigned long *usemap) +static void __init check_usemap_section_nr(int nid, + struct mem_section_usage *usage) { } #endif /* CONFIG_MEMORY_HOTREMOVE */ @@ -404,8 +446,8 @@ static unsigned long __init section_map_size(void) return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); } -struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +struct page __init *__populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { unsigned long size = section_map_size(); struct page *map = sparse_buffer_alloc(size); @@ -474,32 +516,35 @@ static void __init sparse_init_nid(int nid, unsigned long pnum_begin, unsigned long pnum_end, unsigned long map_count) { - unsigned long pnum, usemap_longs, *usemap; + struct mem_section_usage *usage; + unsigned long pnum; struct page *map; - usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS); - usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), - usemap_size() * - map_count); - if (!usemap) { + usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid), + mem_section_usage_size() * map_count); + if (!usage) { pr_err("%s: node[%d] usemap allocation failed", __func__, nid); goto failed; } sparse_buffer_init(map_count * section_map_size(), nid); for_each_present_section_nr(pnum_begin, pnum) { + unsigned long pfn = section_nr_to_pfn(pnum); + if (pnum >= pnum_end) break; - map = sparse_mem_map_populate(pnum, nid, NULL); + map = __populate_section_memmap(pfn, PAGES_PER_SECTION, + nid, NULL); if (!map) { pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.", __func__, nid); pnum_begin = pnum; goto failed; } - check_usemap_section_nr(nid, usemap); - sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap); - usemap += usemap_longs; + check_usemap_section_nr(nid, usage); + sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage, + SECTION_IS_EARLY); + usage = (void *) usage + mem_section_usage_size(); } sparse_buffer_fini(); return; @@ -590,21 +635,20 @@ void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn) #endif #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - struct vmem_altmap *altmap) +static struct page *populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { - /* This will make the necessary allocations eventually. */ - return sparse_mem_map_populate(pnum, nid, altmap); + return __populate_section_memmap(pfn, nr_pages, nid, altmap); } -static void __kfree_section_memmap(struct page *memmap, + +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long start = (unsigned long) pfn_to_page(pfn); + unsigned long end = start + nr_pages * sizeof(struct page); vmemmap_free(start, end, altmap); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; @@ -612,9 +656,9 @@ static void free_map_bootmem(struct page *memmap) vmemmap_free(start, end, NULL); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #else -static struct page *__kmalloc_section_memmap(void) +struct page *populate_section_memmap(unsigned long pfn, + unsigned long nr_pages, int nid, struct vmem_altmap *altmap) { struct page *page, *ret; unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; @@ -635,15 +679,11 @@ got_map_ptr: return ret; } -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, +static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap) { - return __kmalloc_section_memmap(); -} + struct page *memmap = pfn_to_page(pfn); -static void __kfree_section_memmap(struct page *memmap, - struct vmem_altmap *altmap) -{ if (is_vmalloc_addr(memmap)) vfree(memmap); else @@ -651,7 +691,6 @@ static void __kfree_section_memmap(struct page *memmap, get_order(sizeof(struct page) * PAGES_PER_SECTION)); } -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; @@ -681,13 +720,122 @@ static void free_map_bootmem(struct page *memmap) put_page_bootmem(page); } } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_SPARSEMEM_VMEMMAP */ +static void section_deactivate(unsigned long pfn, unsigned long nr_pages, + struct vmem_altmap *altmap) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + bool section_is_early = early_section(ms); + struct page *memmap = NULL; + unsigned long *subsection_map = ms->usage + ? &ms->usage->subsection_map[0] : NULL; + + subsection_mask_set(map, pfn, nr_pages); + if (subsection_map) + bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION); + + if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION), + "section already deactivated (%#lx + %ld)\n", + pfn, nr_pages)) + return; + + /* + * There are 3 cases to handle across two configurations + * (SPARSEMEM_VMEMMAP={y,n}): + * + * 1/ deactivation of a partial hot-added section (only possible + * in the SPARSEMEM_VMEMMAP=y case). + * a/ section was present at memory init + * b/ section was hot-added post memory init + * 2/ deactivation of a complete hot-added section + * 3/ deactivation of a complete section from memory init + * + * For 1/, when subsection_map does not empty we will not be + * freeing the usage map, but still need to free the vmemmap + * range. + * + * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified + */ + bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION); + if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) { + unsigned long section_nr = pfn_to_section_nr(pfn); + + if (!section_is_early) { + kfree(ms->usage); + ms->usage = NULL; + } + memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr); + ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr); + } + + if (section_is_early && memmap) + free_map_bootmem(memmap); + else + depopulate_section_memmap(pfn, nr_pages, altmap); +} + +static struct page * __meminit section_activate(int nid, unsigned long pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) +{ + DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 }; + struct mem_section *ms = __pfn_to_section(pfn); + struct mem_section_usage *usage = NULL; + unsigned long *subsection_map; + struct page *memmap; + int rc = 0; + + subsection_mask_set(map, pfn, nr_pages); + + if (!ms->usage) { + usage = kzalloc(mem_section_usage_size(), GFP_KERNEL); + if (!usage) + return ERR_PTR(-ENOMEM); + ms->usage = usage; + } + subsection_map = &ms->usage->subsection_map[0]; + + if (bitmap_empty(map, SUBSECTIONS_PER_SECTION)) + rc = -EINVAL; + else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION)) + rc = -EEXIST; + else + bitmap_or(subsection_map, map, subsection_map, + SUBSECTIONS_PER_SECTION); + + if (rc) { + if (usage) + ms->usage = NULL; + kfree(usage); + return ERR_PTR(rc); + } + + /* + * The early init code does not consider partially populated + * initial sections, it simply assumes that memory will never be + * referenced. If we hot-add memory into such a section then we + * do not need to populate the memmap and can simply reuse what + * is already there. + */ + if (nr_pages < PAGES_PER_SECTION && early_section(ms)) + return pfn_to_page(pfn); + + memmap = populate_section_memmap(pfn, nr_pages, nid, altmap); + if (!memmap) { + section_deactivate(pfn, nr_pages, altmap); + return ERR_PTR(-ENOMEM); + } + + return memmap; +} + /** - * sparse_add_one_section - add a memory section + * sparse_add_section - add a memory section, or populate an existing one * @nid: The node to add section on * @start_pfn: start pfn of the memory range + * @nr_pages: number of pfns to add in the section * @altmap: device page map * * This is only intended for hotplug. @@ -697,56 +845,40 @@ static void free_map_bootmem(struct page *memmap) * * -EEXIST - Section has been present. * * -ENOMEM - Out of memory. */ -int __meminit sparse_add_one_section(int nid, unsigned long start_pfn, - struct vmem_altmap *altmap) +int __meminit sparse_add_section(int nid, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct mem_section *ms; struct page *memmap; - unsigned long *usemap; int ret; - /* - * no locking for this, because it does its own - * plus, it does a kmalloc - */ ret = sparse_index_init(section_nr, nid); - if (ret < 0 && ret != -EEXIST) + if (ret < 0) return ret; - ret = 0; - memmap = kmalloc_section_memmap(section_nr, nid, altmap); - if (!memmap) - return -ENOMEM; - usemap = __kmalloc_section_usemap(); - if (!usemap) { - __kfree_section_memmap(memmap, altmap); - return -ENOMEM; - } - ms = __pfn_to_section(start_pfn); - if (ms->section_mem_map & SECTION_MARKED_PRESENT) { - ret = -EEXIST; - goto out; - } + memmap = section_activate(nid, start_pfn, nr_pages, altmap); + if (IS_ERR(memmap)) + return PTR_ERR(memmap); /* * Poison uninitialized struct pages in order to catch invalid flags * combinations. */ - page_init_poison(memmap, sizeof(struct page) * PAGES_PER_SECTION); + page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages); + ms = __pfn_to_section(start_pfn); + set_section_nid(section_nr, nid); section_mark_present(ms); - sparse_init_one_section(ms, section_nr, memmap, usemap); -out: - if (ret < 0) { - kfree(usemap); - __kfree_section_memmap(memmap, altmap); - } - return ret; + /* Align memmap to section boundary in the subsection case */ + if (section_nr_to_pfn(section_nr) != start_pfn) + memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr)); + sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0); + + return 0; } -#ifdef CONFIG_MEMORY_HOTREMOVE #ifdef CONFIG_MEMORY_FAILURE static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { @@ -777,51 +909,12 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -static void free_section_usemap(struct page *memmap, unsigned long *usemap, +void sparse_remove_section(struct mem_section *ms, unsigned long pfn, + unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - struct page *usemap_page; - - if (!usemap) - return; - - usemap_page = virt_to_page(usemap); - /* - * Check to see if allocation came from hot-plug-add - */ - if (PageSlab(usemap_page) || PageCompound(usemap_page)) { - kfree(usemap); - if (memmap) - __kfree_section_memmap(memmap, altmap); - return; - } - - /* - * The usemap came from bootmem. This is packed with other usemaps - * on the section which has pgdat at boot time. Just keep it as is now. - */ - - if (memmap) - free_map_bootmem(memmap); -} - -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, - unsigned long map_offset, struct vmem_altmap *altmap) -{ - struct page *memmap = NULL; - unsigned long *usemap = NULL; - - if (ms->section_mem_map) { - usemap = ms->pageblock_flags; - memmap = sparse_decode_mem_map(ms->section_mem_map, - __section_nr(ms)); - ms->section_mem_map = 0; - ms->pageblock_flags = NULL; - } - - clear_hwpoisoned_pages(memmap + map_offset, - PAGES_PER_SECTION - map_offset); - free_section_usemap(memmap, usemap, altmap); + clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, + nr_pages - map_offset); + section_deactivate(pfn, nr_pages, altmap); } -#endif /* CONFIG_MEMORY_HOTREMOVE */ #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/swap.c b/mm/swap.c index 7ede3eddc12a..ae300397dfda 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -8,7 +8,7 @@ /* * This file contains the default values for the operation of the * Linux VM subsystem. Fine-tuning documentation can be found in - * Documentation/sysctl/vm.txt. + * Documentation/admin-guide/sysctl/vm.rst. * Started 18.12.91 * Swap aging added 23.2.95, Stephen Tweedie. * Buffermem limits added 12.3.98, Rik van Riel. @@ -740,15 +740,20 @@ void release_pages(struct page **pages, int nr) if (is_huge_zero_page(page)) continue; - /* Device public page can not be huge page */ - if (is_device_public_page(page)) { + if (is_zone_device_page(page)) { if (locked_pgdat) { spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); locked_pgdat = NULL; } - put_devmap_managed_page(page); - continue; + /* + * ZONE_DEVICE pages that return 'false' from + * put_devmap_managed_page() do not require special + * processing, and instead, expect a call to + * put_page_testzero(). + */ + if (put_devmap_managed_page(page)) + continue; } page = compound_head(page); diff --git a/mm/util.c b/mm/util.c index 68575a315dc5..e6351a80f248 100644 --- a/mm/util.c +++ b/mm/util.c @@ -7,6 +7,7 @@ #include <linux/err.h> #include <linux/sched.h> #include <linux/sched/mm.h> +#include <linux/sched/signal.h> #include <linux/sched/task_stack.h> #include <linux/security.h> #include <linux/swap.h> @@ -300,6 +301,80 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) } #endif +/** + * __account_locked_vm - account locked pages to an mm's locked_vm + * @mm: mm to account against + * @pages: number of pages to account + * @inc: %true if @pages should be considered positive, %false if not + * @task: task used to check RLIMIT_MEMLOCK + * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped + * + * Assumes @task and @mm are valid (i.e. at least one reference on each), and + * that mmap_sem is held as writer. + * + * Return: + * * 0 on success + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. + */ +int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, + struct task_struct *task, bool bypass_rlim) +{ + unsigned long locked_vm, limit; + int ret = 0; + + lockdep_assert_held_write(&mm->mmap_sem); + + locked_vm = mm->locked_vm; + if (inc) { + if (!bypass_rlim) { + limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked_vm + pages > limit) + ret = -ENOMEM; + } + if (!ret) + mm->locked_vm = locked_vm + pages; + } else { + WARN_ON_ONCE(pages > locked_vm); + mm->locked_vm = locked_vm - pages; + } + + pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid, + (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT, + locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK), + ret ? " - exceeded" : ""); + + return ret; +} +EXPORT_SYMBOL_GPL(__account_locked_vm); + +/** + * account_locked_vm - account locked pages to an mm's locked_vm + * @mm: mm to account against, may be NULL + * @pages: number of pages to account + * @inc: %true if @pages should be considered positive, %false if not + * + * Assumes a non-NULL @mm is valid (i.e. at least one reference on it). + * + * Return: + * * 0 on success, or if mm is NULL + * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded. + */ +int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) +{ + int ret; + + if (pages == 0 || !mm) + return 0; + + down_write(&mm->mmap_sem); + ret = __account_locked_vm(mm, pages, inc, current, + capable(CAP_IPC_LOCK)); + up_write(&mm->mmap_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(account_locked_vm); + unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff) diff --git a/mm/vmscan.c b/mm/vmscan.c index f8e3dcd527b8..44df66a98f2a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -131,6 +131,9 @@ struct scan_control { unsigned int file_taken; unsigned int taken; } nr; + + /* for recording the reclaimed slab by now */ + struct reclaim_state reclaim_state; }; #ifdef ARCH_HAS_PREFETCH @@ -238,6 +241,18 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) } #endif /* CONFIG_MEMCG_KMEM */ +static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +{ + /* Check for an overwrite */ + WARN_ON_ONCE(rs && task->reclaim_state); + + /* Check for the nulling of an already-nulled member */ + WARN_ON_ONCE(!rs && !task->reclaim_state); + + task->reclaim_state = rs; +} + #ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { @@ -3191,11 +3206,13 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask)) return 1; + set_task_reclaim_state(current, &sc.reclaim_state); trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + set_task_reclaim_state(current, NULL); return nr_reclaimed; } @@ -3218,6 +3235,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, }; unsigned long lru_pages; + set_task_reclaim_state(current, &sc.reclaim_state); sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -3235,7 +3253,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + set_task_reclaim_state(current, NULL); *nr_scanned = sc.nr_scanned; + return sc.nr_reclaimed; } @@ -3262,6 +3282,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_shrinkslab = 1, }; + set_task_reclaim_state(current, &sc.reclaim_state); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -3282,6 +3303,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, psi_memstall_leave(&pflags); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + set_task_reclaim_state(current, NULL); return nr_reclaimed; } @@ -3483,6 +3505,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, }; + set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); __fs_reclaim_acquire(); @@ -3664,6 +3687,8 @@ out: snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); psi_memstall_leave(&pflags); + set_task_reclaim_state(current, NULL); + /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller @@ -3787,15 +3812,10 @@ static int kswapd(void *p) unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - - struct reclaim_state reclaim_state = { - .reclaimed_slab = 0, - }; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); - current->reclaim_state = &reclaim_state; /* * Tell the memory management that we're a "memory allocator", @@ -3857,7 +3877,6 @@ kswapd_try_sleep: } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); - current->reclaim_state = NULL; return 0; } @@ -3922,7 +3941,6 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, */ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - struct reclaim_state reclaim_state; struct scan_control sc = { .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, @@ -3934,18 +3952,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) .hibernation_mode = 1, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); - struct task_struct *p = current; unsigned long nr_reclaimed; unsigned int noreclaim_flag; fs_reclaim_acquire(sc.gfp_mask); noreclaim_flag = memalloc_noreclaim_save(); - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + set_task_reclaim_state(current, &sc.reclaim_state); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - p->reclaim_state = NULL; + set_task_reclaim_state(current, NULL); memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); @@ -4110,7 +4126,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in /* Minimum pages needed in order to stay on node */ const unsigned long nr_pages = 1 << order; struct task_struct *p = current; - struct reclaim_state reclaim_state; unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), @@ -4135,8 +4150,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + set_task_reclaim_state(p, &sc.reclaim_state); if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) { /* @@ -4148,7 +4162,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } - p->reclaim_state = NULL; + set_task_reclaim_state(p, NULL); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); fs_reclaim_release(sc.gfp_mask); diff --git a/mm/z3fold.c b/mm/z3fold.c index dfcd69d08c1e..1a029a7432ee 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -26,7 +26,6 @@ #include <linux/atomic.h> #include <linux/sched.h> #include <linux/cpumask.h> -#include <linux/dcache.h> #include <linux/list.h> #include <linux/mm.h> #include <linux/module.h> @@ -36,12 +35,14 @@ #include <linux/compaction.h> #include <linux/percpu.h> #include <linux/mount.h> +#include <linux/pseudo_fs.h> #include <linux/fs.h> #include <linux/preempt.h> #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zpool.h> +#include <linux/magic.h> /* * NCHUNKS_ORDER determines the internal allocation granularity, effectively @@ -101,6 +102,7 @@ struct z3fold_buddy_slots { * @refcount: reference count for the z3fold page * @work: work_struct for page layout optimization * @slots: pointer to the structure holding buddy slots + * @pool: pointer to the containing pool * @cpu: CPU which this page "belongs" to * @first_chunks: the size of the first buddy in chunks, 0 if free * @middle_chunks: the size of the middle buddy in chunks, 0 if free @@ -114,6 +116,7 @@ struct z3fold_header { struct kref refcount; struct work_struct work; struct z3fold_buddy_slots *slots; + struct z3fold_pool *pool; short cpu; unsigned short first_chunks; unsigned short middle_chunks; @@ -193,8 +196,10 @@ static void compact_page_work(struct work_struct *w); static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool, gfp_t gfp) { - struct z3fold_buddy_slots *slots = kmem_cache_alloc(pool->c_handle, - gfp); + struct z3fold_buddy_slots *slots; + + slots = kmem_cache_alloc(pool->c_handle, + (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE))); if (slots) { memset(slots->slot, 0, sizeof(slots->slot)); @@ -241,19 +246,14 @@ static inline void free_handle(unsigned long handle) } } -static struct dentry *z3fold_do_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int z3fold_init_fs_context(struct fs_context *fc) { - static const struct dentry_operations ops = { - .d_dname = simple_dname, - }; - - return mount_pseudo(fs_type, "z3fold:", NULL, &ops, 0x33); + return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type z3fold_fs = { .name = "z3fold", - .mount = z3fold_do_mount, + .init_fs_context = z3fold_init_fs_context, .kill_sb = kill_anon_super, }; @@ -320,6 +320,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, zhdr->start_middle = 0; zhdr->cpu = -1; zhdr->slots = slots; + zhdr->pool = pool; INIT_LIST_HEAD(&zhdr->buddy); INIT_WORK(&zhdr->work, compact_page_work); return zhdr; @@ -426,7 +427,7 @@ static enum buddy handle_to_buddy(unsigned long handle) static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr) { - return slots_to_pool(zhdr->slots); + return zhdr->pool; } static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked) @@ -850,7 +851,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, enum buddy bud; bool can_sleep = gfpflags_allow_blocking(gfp); - if (!size || (gfp & __GFP_HIGHMEM)) + if (!size) return -EINVAL; if (size > PAGE_SIZE) @@ -1345,24 +1346,29 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa zhdr = page_address(page); pool = zhdr_to_pool(zhdr); - if (!trylock_page(page)) - return -EAGAIN; - if (!z3fold_page_trylock(zhdr)) { - unlock_page(page); return -EAGAIN; } if (zhdr->mapped_count != 0) { z3fold_page_unlock(zhdr); - unlock_page(page); return -EBUSY; } + if (work_pending(&zhdr->work)) { + z3fold_page_unlock(zhdr); + return -EAGAIN; + } new_zhdr = page_address(newpage); memcpy(new_zhdr, zhdr, PAGE_SIZE); newpage->private = page->private; page->private = 0; z3fold_page_unlock(zhdr); spin_lock_init(&new_zhdr->page_lock); + INIT_WORK(&new_zhdr->work, compact_page_work); + /* + * z3fold_page_isolate() ensures that new_zhdr->buddy is empty, + * so we only have to reinitialize it. + */ + INIT_LIST_HEAD(&new_zhdr->buddy); new_mapping = page_mapping(page); __ClearPageMovable(page); ClearPagePrivate(page); @@ -1386,7 +1392,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); page_mapcount_reset(page); - unlock_page(page); put_page(page); return 0; } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index db09eb3669c5..57fbb7ced69f 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -52,6 +52,7 @@ #include <linux/zsmalloc.h> #include <linux/zpool.h> #include <linux/mount.h> +#include <linux/pseudo_fs.h> #include <linux/migrate.h> #include <linux/pagemap.h> #include <linux/fs.h> @@ -1798,19 +1799,14 @@ static void lock_zspage(struct zspage *zspage) } while ((page = get_next_page(page)) != NULL); } -static struct dentry *zs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int zs_init_fs_context(struct fs_context *fc) { - static const struct dentry_operations ops = { - .d_dname = simple_dname, - }; - - return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC); + return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM; } static struct file_system_type zsmalloc_fs = { .name = "zsmalloc", - .mount = zs_mount, + .init_fs_context = zs_init_fs_context, .kill_sb = kill_anon_super, }; |