Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              |   2
-rw-r--r--  mm/backing-dev.c        |   2
-rw-r--r--  mm/cma.c                |  19
-rw-r--r--  mm/compaction.c         |   6
-rw-r--r--  mm/filemap.c            |   1
-rw-r--r--  mm/gup.c                |   4
-rw-r--r--  mm/hugetlb.c            |  23
-rw-r--r--  mm/hugetlb_vmemmap.c    |   6
-rw-r--r--  mm/internal.h           |   1
-rw-r--r--  mm/kasan/kasan_test_c.c |  21
-rw-r--r--  mm/memcontrol-v1.c      |   2
-rw-r--r--  mm/memcontrol.c         |  39
-rw-r--r--  mm/memory.c             |  10
-rw-r--r--  mm/migrate.c            |   8
-rw-r--r--  mm/mm_init.c            |   1
-rw-r--r--  mm/page-writeback.c     |   4
-rw-r--r--  mm/page_alloc.c         | 168
-rw-r--r--  mm/slub.c               |  10
-rw-r--r--  mm/userfaultfd.c        |  13
-rw-r--r--  mm/vma.c                |  51
-rw-r--r--  mm/vma.h                |   9
-rw-r--r--  mm/vmscan.c             |  29
 22 files changed, 313 insertions(+), 116 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index d3fb3762887b..e113f713b493 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -201,7 +201,7 @@ config KVFREE_RCU_BATCHED
config SLUB_TINY
bool "Configure for minimal memory footprint"
- depends on EXPERT
+ depends on EXPERT && !COMPILE_TEST
select SLAB_MERGE_DEFAULT
help
Configures the slab allocator in a way to achieve minimal memory
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index e61bbb1bd622..783904d8c5ef 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1151,7 +1151,7 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
void bdi_unregister(struct backing_dev_info *bdi)
{
- del_timer_sync(&bdi->laptop_mode_wb_timer);
+ timer_delete_sync(&bdi->laptop_mode_wb_timer);
/* make sure nobody finds us on the bdi_list anymore */
bdi_remove_from_list(bdi);
diff --git a/mm/cma.c b/mm/cma.c
index b06d5fe73399..15632939f20a 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -35,7 +35,7 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned int cma_area_count;
-static int __init __cma_declare_contiguous_nid(phys_addr_t base,
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
bool fixed, const char *name, struct cma **res_cma,
@@ -370,7 +370,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size,
phys_addr_t align, unsigned int order_per_bit,
const char *name, struct cma **res_cma, int nid)
{
- phys_addr_t start, end;
+ phys_addr_t start = 0, end;
phys_addr_t size, sizesum, sizeleft;
struct cma_init_memrange *mrp, *mlp, *failed;
struct cma_memrange *cmrp;
@@ -384,7 +384,7 @@ int __init cma_declare_contiguous_multi(phys_addr_t total_size,
/*
* First, try it the normal way, producing just one range.
*/
- ret = __cma_declare_contiguous_nid(0, total_size, 0, align,
+ ret = __cma_declare_contiguous_nid(&start, total_size, 0, align,
order_per_bit, false, name, res_cma, nid);
if (ret != -ENOMEM)
goto out;
@@ -580,7 +580,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
{
int ret;
- ret = __cma_declare_contiguous_nid(base, size, limit, alignment,
+ ret = __cma_declare_contiguous_nid(&base, size, limit, alignment,
order_per_bit, fixed, name, res_cma, nid);
if (ret != 0)
pr_err("Failed to reserve %ld MiB\n",
@@ -592,14 +592,14 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
return ret;
}
-static int __init __cma_declare_contiguous_nid(phys_addr_t base,
+static int __init __cma_declare_contiguous_nid(phys_addr_t *basep,
phys_addr_t size, phys_addr_t limit,
phys_addr_t alignment, unsigned int order_per_bit,
bool fixed, const char *name, struct cma **res_cma,
int nid)
{
phys_addr_t memblock_end = memblock_end_of_DRAM();
- phys_addr_t highmem_start;
+ phys_addr_t highmem_start, base = *basep;
int ret;
/*
@@ -722,12 +722,15 @@ static int __init __cma_declare_contiguous_nid(phys_addr_t base,
}
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
- if (ret)
+ if (ret) {
memblock_phys_free(base, size);
+ return ret;
+ }
(*res_cma)->nid = nid;
+ *basep = base;
- return ret;
+ return 0;
}
static void cma_debug_show_areas(struct cma *cma)
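
For reference, a minimal userspace sketch of the output-parameter pattern the cma.c hunks introduce: __cma_declare_contiguous_nid() now takes the base by pointer so the address it actually reserves (possibly chosen by the helper itself when the caller passes 0) is reported back to cma_declare_contiguous_multi(). reserve_range() and pick_base() below are made-up stand-ins, not kernel functions.

#include <stdint.h>
#include <stdio.h>

/* Made-up stand-in for the allocator choosing a base when none was fixed. */
static uint64_t pick_base(uint64_t size)
{
    return 0x40000000ULL - size;
}

/*
 * Model of the new calling convention: the chosen base is written back
 * through *basep so the caller knows where the reservation ended up.
 */
static int reserve_range(uint64_t *basep, uint64_t size)
{
    uint64_t base = *basep;

    if (base == 0)          /* caller did not fix a base */
        base = pick_base(size);
    if (base == 0)
        return -1;          /* models -ENOMEM */

    *basep = base;          /* report what was actually reserved */
    return 0;
}

int main(void)
{
    uint64_t base = 0;

    if (!reserve_range(&base, 1 << 20))
        printf("reserved at %#llx\n", (unsigned long long)base);
    return 0;
}
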
diff --git a/mm/compaction.c b/mm/compaction.c
index 139f00c0308a..ca71fd3c3181 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -981,13 +981,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
}
if (PageHuge(page)) {
+ const unsigned int order = compound_order(page);
/*
* skip hugetlbfs if we are not compacting for pages
* bigger than its order. THPs and other compound pages
* are handled below.
*/
if (!cc->alloc_contig) {
- const unsigned int order = compound_order(page);
if (order <= MAX_PAGE_ORDER) {
low_pfn += (1UL << order) - 1;
@@ -1011,8 +1011,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/* Do not report -EBUSY down the chain */
if (ret == -EBUSY)
ret = 0;
- low_pfn += compound_nr(page) - 1;
- nr_scanned += compound_nr(page) - 1;
+ low_pfn += (1UL << order) - 1;
+ nr_scanned += (1UL << order) - 1;
goto isolate_fail;
}
diff --git a/mm/filemap.c b/mm/filemap.c
index b5e784f34d98..7b90cbeb4a1a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2244,6 +2244,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping,
*start = folio->index + nr;
goto out;
}
+ xas_advance(&xas, folio_next_index(folio) - 1);
continue;
put_folio:
folio_put(folio);
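
The filemap_get_folios_contig() hunk adds an xas_advance() so that, after taking a large folio, the walk resumes past its tail entries instead of revisiting them. A small index-only model of that skip, treating the cursor as a plain integer that the loop steps by one per pass (large folios assumed at indices 0 and 16 purely for illustration):

#include <stdio.h>

int main(void)
{
    unsigned long index = 0, end = 32;

    while (index <= end) {
        /* pretend large (16-page) folios live at indices 0 and 16 */
        unsigned long nr = (index == 0 || index == 16) ? 16 : 1;

        printf("grab folio at %lu spanning %lu pages\n", index, nr);
        index += nr - 1;    /* models xas_advance(&xas, folio_next_index(folio) - 1) */
        index++;            /* models the iterator's own step to the next entry */
    }
    return 0;
}
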
diff --git a/mm/gup.c b/mm/gup.c
index 92351e2fa876..84461d384ae2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2207,8 +2207,8 @@ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
} while (start != end);
mmap_read_unlock(mm);
- if (size > (unsigned long)uaddr - start)
- return size - ((unsigned long)uaddr - start);
+ if (size > start - (unsigned long)uaddr)
+ return size - (start - (unsigned long)uaddr);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
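
The fault_in_safe_writeable() fix corrects the leftover-byte arithmetic: start is advanced page by page from (roughly) uaddr, so the amount faulted in is start - uaddr, not uaddr - start. A worked model of the corrected return value, assuming a page-aligned uaddr and 4 KiB pages:

#include <assert.h>
#include <stddef.h>

/* Bytes that could NOT be faulted in, per the corrected expression. */
static size_t leftover(unsigned long uaddr, size_t size, unsigned long start)
{
    if (size > start - uaddr)
        return size - (start - uaddr);
    return 0;
}

int main(void)
{
    /* asked for 3 pages at 0x10000, the loop stopped after 2 pages */
    assert(leftover(0x10000, 3 * 4096, 0x10000 + 2 * 4096) == 4096);
    /* everything faulted in: start walked past uaddr + size */
    assert(leftover(0x10000, 2 * 4096, 0x10000 + 2 * 4096) == 0);
    return 0;
}
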
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 39f92aad7bd1..e3e6ac991b9c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2271,7 +2271,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
* as surplus_pages, otherwise it might confuse
* persistent_huge_pages() momentarily.
*/
- __prep_account_new_huge_page(h, nid);
+ __prep_account_new_huge_page(h, folio_nid(folio));
/*
* We could have raced with the pool size change.
@@ -3825,6 +3825,7 @@ found:
static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
nodemask_t *nodes_allowed)
{
+ unsigned long persistent_free_count;
unsigned long min_count;
unsigned long allocated;
struct folio *folio;
@@ -3959,8 +3960,24 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
* though, we'll note that we're not allowed to exceed surplus
* and won't grow the pool anywhere else. Not until one of the
* sysctls are changed, or the surplus pages go out of use.
+ *
+ * min_count is the expected number of persistent pages, we
+ * shouldn't calculate min_count by using
+ * resv_huge_pages + persistent_huge_pages() - free_huge_pages,
+ * because there may exist free surplus huge pages, and this will
+ * lead to subtracting twice. Free surplus huge pages come from HVO
+ * failing to restore vmemmap, see comments in the callers of
+ * hugetlb_vmemmap_restore_folio(). Thus, we should calculate
+ * persistent free count first.
*/
- min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
+ persistent_free_count = h->free_huge_pages;
+ if (h->free_huge_pages > persistent_huge_pages(h)) {
+ if (h->free_huge_pages > h->surplus_huge_pages)
+ persistent_free_count -= h->surplus_huge_pages;
+ else
+ persistent_free_count = 0;
+ }
+ min_count = h->resv_huge_pages + persistent_huge_pages(h) - persistent_free_count;
min_count = max(count, min_count);
try_to_free_low(h, min_count, nodes_allowed);
@@ -4630,7 +4647,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("HugeTLB: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
}
#ifdef CONFIG_NUMA
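
The set_max_huge_pages() hunk argues that free surplus pages must not be treated as persistent free pages when computing min_count. A userspace model of the new computation, with made-up pool counters; "persistent" stands in for persistent_huge_pages(), i.e. nr_huge_pages - surplus_huge_pages:

#include <assert.h>

static unsigned long model_min_count(unsigned long resv, unsigned long persistent,
                                     unsigned long free, unsigned long surplus)
{
    unsigned long persistent_free = free;

    if (free > persistent) {        /* some free pages must be surplus */
        if (free > surplus)
            persistent_free -= surplus;
        else
            persistent_free = 0;
    }
    return resv + persistent - persistent_free;
}

int main(void)
{
    /*
     * 10 pages total, 4 of them surplus (so 6 persistent), 8 free,
     * 2 reserved: at least 8 - 6 = 2 free pages are surplus, so the
     * heuristic counts only 8 - 4 = 4 of the free pages as persistent,
     * giving min_count = 2 + 6 - 4.
     */
    assert(model_min_count(2, 6, 8, 4) == 2 + 6 - 4);
    return 0;
}
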
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 9a99dfa3c495..27245e86df25 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
* struct page, the special metadata (e.g. page->flags or page->mapping)
* cannot copy to the tail struct page structs. The invalid value will be
* checked in the free_tail_page_prepare(). In order to avoid the message
- * of "corrupted mapping in tail page". We need to reset at least 3 (one
- * head struct page struct and two tail struct page structs) struct page
+ * of "corrupted mapping in tail page". We need to reset at least 4 (one
+ * head struct page struct and three tail struct page structs) struct page
* structs.
*/
-#define NR_RESET_STRUCT_PAGE 3
+#define NR_RESET_STRUCT_PAGE 4
static inline void reset_struct_pages(struct page *start)
{
diff --git a/mm/internal.h b/mm/internal.h
index 50c2f590b2d0..e9695baa5922 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1595,6 +1595,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc);
#ifdef CONFIG_UNACCEPTED_MEMORY
void accept_page(struct page *page);
+void unaccepted_cleanup_work(struct work_struct *work);
#else /* CONFIG_UNACCEPTED_MEMORY */
static inline void accept_page(struct page *page)
{
diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c
index 3ea317837c2d..5f922dd38ffa 100644
--- a/mm/kasan/kasan_test_c.c
+++ b/mm/kasan/kasan_test_c.c
@@ -1567,6 +1567,7 @@ static void kasan_memcmp(struct kunit *test)
static void kasan_strings(struct kunit *test)
{
char *ptr;
+ char *src;
size_t size = 24;
/*
@@ -1578,6 +1579,25 @@ static void kasan_strings(struct kunit *test)
ptr = kmalloc(size, GFP_KERNEL | __GFP_ZERO);
KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr);
+ src = kmalloc(KASAN_GRANULE_SIZE, GFP_KERNEL | __GFP_ZERO);
+ strscpy(src, "f0cacc1a0000000", KASAN_GRANULE_SIZE);
+
+ /*
+ * Make sure that strscpy() does not trigger KASAN if it overreads into
+ * poisoned memory.
+ *
+ * The expected size does not include the terminator '\0'
+ * so it is (KASAN_GRANULE_SIZE - 2) ==
+ * KASAN_GRANULE_SIZE - ("initial removed character" + "\0").
+ */
+ KUNIT_EXPECT_EQ(test, KASAN_GRANULE_SIZE - 2,
+ strscpy(ptr, src + 1, KASAN_GRANULE_SIZE));
+
+ /* strscpy should fail if the first byte is unreadable. */
+ KUNIT_EXPECT_KASAN_FAIL(test, strscpy(ptr, src + KASAN_GRANULE_SIZE,
+ KASAN_GRANULE_SIZE));
+
+ kfree(src);
kfree(ptr);
/*
@@ -2127,4 +2147,5 @@ static struct kunit_suite kasan_kunit_test_suite = {
kunit_test_suite(kasan_kunit_test_suite);
+MODULE_DESCRIPTION("KUnit tests for checking KASAN bug-detection capabilities");
MODULE_LICENSE("GPL");
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 8660908850dc..4a9cf27a70af 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -620,7 +620,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- swap_cgroup_record(folio, mem_cgroup_id(memcg), entry);
+ swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry);
folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 421740f1bcdc..c96c1f2b9cf5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1759,7 +1759,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
struct memcg_stock_pcp {
- localtry_lock_t stock_lock;
+ local_trylock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -1774,7 +1774,7 @@ struct memcg_stock_pcp {
#define FLUSHING_CACHED_CHARGE 0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
- .stock_lock = INIT_LOCALTRY_LOCK(stock_lock),
+ .stock_lock = INIT_LOCAL_TRYLOCK(stock_lock),
};
static DEFINE_MUTEX(percpu_charge_mutex);
@@ -1805,11 +1805,10 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
- if (!gfpflags_allow_spinning(gfp_mask))
- return ret;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
- }
+ if (gfpflags_allow_spinning(gfp_mask))
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
+ else if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags))
+ return ret;
stock = this_cpu_ptr(&memcg_stock);
stock_pages = READ_ONCE(stock->nr_pages);
@@ -1818,7 +1817,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages,
ret = true;
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -1857,14 +1856,14 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -1894,7 +1893,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long flags;
- if (!localtry_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
+ if (!local_trylock_irqsave(&memcg_stock.stock_lock, flags)) {
/*
* In case of unlikely failure to lock percpu stock_lock
* uncharge memcg directly.
@@ -1907,7 +1906,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
return;
}
__refill_stock(memcg, nr_pages);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
/*
@@ -1964,9 +1963,9 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
stock = &per_cpu(memcg_stock, cpu);
/* drain_obj_stock requires stock_lock */
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
old = drain_obj_stock(stock);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
drain_stock(stock);
obj_cgroup_put(old);
@@ -2787,7 +2786,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
unsigned long flags;
int *bytes;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
/*
@@ -2836,7 +2835,7 @@ static void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
__mod_objcg_mlstate(objcg, pgdat, idx, nr);
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
}
@@ -2846,7 +2845,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == READ_ONCE(stock->cached_objcg) && stock->nr_bytes >= nr_bytes) {
@@ -2854,7 +2853,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
ret = true;
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -2946,7 +2945,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
unsigned long flags;
unsigned int nr_pages = 0;
- localtry_lock_irqsave(&memcg_stock.stock_lock, flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (READ_ONCE(stock->cached_objcg) != objcg) { /* reset if necessary */
@@ -2960,7 +2959,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- localtry_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
obj_cgroup_put(old);
if (nr_pages)
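
The memcontrol.c hunks move from the interim localtry_lock API to local_trylock_t and rework the consume_stock() entry: contexts that may spin take the lock unconditionally, while trylock-only contexts give up on contention and fall back to charging directly. A stand-alone model of that decision, using a plain C11 atomic flag as a stand-in for the per-CPU memcg_stock.stock_lock:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_flag stock_lock = ATOMIC_FLAG_INIT;   /* stand-in for the per-CPU lock */

static void lock(void)    { while (atomic_flag_test_and_set(&stock_lock)) ; }
static bool trylock(void) { return !atomic_flag_test_and_set(&stock_lock); }
static void unlock(void)  { atomic_flag_clear(&stock_lock); }

/* Model of the reworked locking decision in consume_stock(). */
static bool consume_stock(bool may_spin)
{
    if (may_spin)
        lock();                 /* ordinary acquisition, may wait */
    else if (!trylock())
        return false;           /* contended: caller charges directly instead */

    /* ... consume from the cached per-CPU stock here ... */
    unlock();
    return true;
}

int main(void)
{
    printf("spinning caller: %d\n", consume_stock(true));
    printf("non-spinning caller: %d\n", consume_stock(false));
    return 0;
}
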
diff --git a/mm/memory.c b/mm/memory.c
index 2d8c265fc7d6..ba3ea0a82f7f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1361,7 +1361,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
struct mm_struct *dst_mm = dst_vma->vm_mm;
struct mm_struct *src_mm = src_vma->vm_mm;
struct mmu_notifier_range range;
- unsigned long next, pfn;
+ unsigned long next, pfn = 0;
bool is_cow;
int ret;
@@ -2938,11 +2938,11 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
if (fn) {
do {
if (create || !pte_none(ptep_get(pte))) {
- err = fn(pte++, addr, data);
+ err = fn(pte, addr, data);
if (err)
break;
}
- } while (addr += PAGE_SIZE, addr != end);
+ } while (pte++, addr += PAGE_SIZE, addr != end);
}
*mask |= PGTBL_PTE_MODIFIED;
@@ -3734,8 +3734,6 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
return false;
VM_WARN_ON_ONCE(folio_test_ksm(folio));
- VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio));
- VM_WARN_ON_ONCE(folio_entire_mapcount(folio));
if (unlikely(folio_test_swapcache(folio))) {
/*
@@ -3760,6 +3758,8 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio,
if (folio_large_mapcount(folio) != folio_ref_count(folio))
goto unlock;
+ VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio);
+ VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio);
VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id &&
folio_mm_id(folio, 1) != vma->vm_mm->mm_id);
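
The apply_to_pte_range() fix moves pte++ into the while() update expression so the PTE pointer and the address advance together on every iteration, not only on entries for which fn() was invoked. A minimal model of the corrected loop idiom over an ordinary array:

#include <stdio.h>

static int fn(int *slot, unsigned long addr)
{
    (void)addr;
    *slot += 1;                 /* pretend per-entry callback */
    return 0;
}

int main(void)
{
    int table[8] = { 0, 3, 0, 5, 7, 0, 0, 9 };
    int *pte = table;
    unsigned long addr = 0, end = 8 * 4096;
    int err = 0;

    do {
        if (*pte != 0) {        /* models the create || !pte_none() check */
            err = fn(pte, addr);
            if (err)
                break;
        }
        /* pointer and address step together whether or not fn() ran */
    } while (pte++, addr += 4096, addr != end);

    for (int i = 0; i < 8; i++)
        printf("%d ", table[i]);
    printf("\n");
    return err;
}
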
diff --git a/mm/migrate.c b/mm/migrate.c
index f3ee6d8d5e2e..676d9cfc7059 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -845,9 +845,11 @@ static int __buffer_migrate_folio(struct address_space *mapping,
return -EAGAIN;
if (check_refs) {
- bool busy;
+ bool busy, migrating;
bool invalidated = false;
+ migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state);
+ VM_WARN_ON_ONCE(migrating);
recheck_buffers:
busy = false;
spin_lock(&mapping->i_private_lock);
@@ -859,12 +861,12 @@ recheck_buffers:
}
bh = bh->b_this_page;
} while (bh != head);
+ spin_unlock(&mapping->i_private_lock);
if (busy) {
if (invalidated) {
rc = -EAGAIN;
goto unlock_buffers;
}
- spin_unlock(&mapping->i_private_lock);
invalidate_bh_lrus();
invalidated = true;
goto recheck_buffers;
@@ -883,7 +885,7 @@ recheck_buffers:
unlock_buffers:
if (check_refs)
- spin_unlock(&mapping->i_private_lock);
+ clear_bit_unlock(BH_Migrate, &head->b_state);
bh = head;
do {
unlock_buffer(bh);
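
The __buffer_migrate_folio() change sets BH_Migrate on the buffer head so concurrent lookups can skip it, which lets the function drop mapping->i_private_lock right after scanning the buffers instead of holding it across invalidate_bh_lrus() and the retry. A small model of the acquire/release marker pairing, using C11 atomics in place of test_and_set_bit_lock()/clear_bit_unlock():

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

#define BH_MIGRATE (1u << 0)                 /* stand-in for the BH_Migrate state bit */

static _Atomic unsigned int b_state;         /* stand-in for head->b_state */

/* Returns true if the bit was already set (the kernel warns in that case). */
static bool mark_migrating(void)
{
    return atomic_fetch_or_explicit(&b_state, BH_MIGRATE,
                                    memory_order_acquire) & BH_MIGRATE;
}

static void clear_migrating(void)
{
    atomic_fetch_and_explicit(&b_state, ~BH_MIGRATE, memory_order_release);
}

int main(void)
{
    assert(!mark_migrating());  /* first claimant wins */
    assert(mark_migrating());   /* anyone else sees the marker and can skip */
    clear_migrating();
    assert(!mark_migrating());
    return 0;
}
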
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 84f14fa12d0d..9659689b8ace 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1441,6 +1441,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
#ifdef CONFIG_UNACCEPTED_MEMORY
INIT_LIST_HEAD(&zone->unaccepted_pages);
+ INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work);
#endif
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 18456ddd463b..c81624bc3969 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -640,7 +640,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
#ifdef CONFIG_CGROUP_WRITEBACK
void wb_domain_exit(struct wb_domain *dom)
{
- del_timer_sync(&dom->period_timer);
+ timer_delete_sync(&dom->period_timer);
fprop_global_destroy(&dom->completions);
}
#endif
@@ -2229,7 +2229,7 @@ void laptop_sync_completion(void)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
- del_timer(&bdi->laptop_mode_wb_timer);
+ timer_delete(&bdi->laptop_mode_wb_timer);
rcu_read_unlock();
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd6b865cb1ab..5669baf2a6fe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1400,11 +1400,12 @@ static void free_one_page(struct zone *zone, struct page *page,
struct llist_head *llhead;
unsigned long flags;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (unlikely(fpi_flags & FPI_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags)) {
add_page_to_zone_llist(zone, page, order);
return;
}
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
@@ -2182,23 +2183,15 @@ try_to_claim_block(struct zone *zone, struct page *page,
}
/*
- * Try finding a free buddy page on the fallback list.
- *
- * This will attempt to claim a whole pageblock for the requested type
- * to ensure grouping of such requests in the future.
- *
- * If a whole block cannot be claimed, steal an individual page, regressing to
- * __rmqueue_smallest() logic to at least break up as little contiguity as
- * possible.
+ * Try to allocate from some fallback migratetype by claiming the entire block,
+ * i.e. converting it to the allocation's start migratetype.
*
* The use of signed ints for order and current_order is a deliberate
* deviation from the rest of this file, to make the for loop
* condition simpler.
- *
- * Return the stolen page, or NULL if none can be found.
*/
static __always_inline struct page *
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+__rmqueue_claim(struct zone *zone, int order, int start_migratetype,
unsigned int alloc_flags)
{
struct free_area *area;
@@ -2236,14 +2229,29 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
page = try_to_claim_block(zone, page, current_order, order,
start_migratetype, fallback_mt,
alloc_flags);
- if (page)
- goto got_one;
+ if (page) {
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
+ }
}
- if (alloc_flags & ALLOC_NOFRAGMENT)
- return NULL;
+ return NULL;
+}
+
+/*
+ * Try to steal a single page from some fallback migratetype. Leave the rest of
+ * the block as its current migratetype, potentially causing fragmentation.
+ */
+static __always_inline struct page *
+__rmqueue_steal(struct zone *zone, int order, int start_migratetype)
+{
+ struct free_area *area;
+ int current_order;
+ struct page *page;
+ int fallback_mt;
+ bool claim_block;
- /* No luck claiming pageblock. Find the smallest fallback page */
for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2253,25 +2261,28 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
page = get_page_from_free_area(area, fallback_mt);
page_del_and_expand(zone, page, order, current_order, fallback_mt);
- goto got_one;
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, fallback_mt);
+ return page;
}
return NULL;
-
-got_one:
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, fallback_mt);
-
- return page;
}
+enum rmqueue_mode {
+ RMQUEUE_NORMAL,
+ RMQUEUE_CMA,
+ RMQUEUE_CLAIM,
+ RMQUEUE_STEAL,
+};
+
/*
* Do the hard work of removing an element from the buddy allocator.
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
__rmqueue(struct zone *zone, unsigned int order, int migratetype,
- unsigned int alloc_flags)
+ unsigned int alloc_flags, enum rmqueue_mode *mode)
{
struct page *page;
@@ -2290,16 +2301,48 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
}
}
- page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (alloc_flags & ALLOC_CMA)
+ /*
+ * First try the freelists of the requested migratetype, then try
+ * fallbacks modes with increasing levels of fragmentation risk.
+ *
+ * The fallback logic is expensive and rmqueue_bulk() calls in
+ * a loop with the zone->lock held, meaning the freelists are
+ * not subject to any outside changes. Remember in *mode where
+ * we found pay dirt, to save us the search on the next call.
+ */
+ switch (*mode) {
+ case RMQUEUE_NORMAL:
+ page = __rmqueue_smallest(zone, order, migratetype);
+ if (page)
+ return page;
+ fallthrough;
+ case RMQUEUE_CMA:
+ if (alloc_flags & ALLOC_CMA) {
page = __rmqueue_cma_fallback(zone, order);
-
- if (!page)
- page = __rmqueue_fallback(zone, order, migratetype,
- alloc_flags);
+ if (page) {
+ *mode = RMQUEUE_CMA;
+ return page;
+ }
+ }
+ fallthrough;
+ case RMQUEUE_CLAIM:
+ page = __rmqueue_claim(zone, order, migratetype, alloc_flags);
+ if (page) {
+ /* Replenished preferred freelist, back to normal mode. */
+ *mode = RMQUEUE_NORMAL;
+ return page;
+ }
+ fallthrough;
+ case RMQUEUE_STEAL:
+ if (!(alloc_flags & ALLOC_NOFRAGMENT)) {
+ page = __rmqueue_steal(zone, order, migratetype);
+ if (page) {
+ *mode = RMQUEUE_STEAL;
+ return page;
+ }
+ }
}
- return page;
+ return NULL;
}
/*
@@ -2311,17 +2354,19 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
int migratetype, unsigned int alloc_flags)
{
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
unsigned long flags;
int i;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
return 0;
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
- alloc_flags);
+ alloc_flags, &rmqm);
if (unlikely(page == NULL))
break;
@@ -2937,15 +2982,18 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
do {
page = NULL;
- if (!spin_trylock_irqsave(&zone->lock, flags)) {
- if (unlikely(alloc_flags & ALLOC_TRYLOCK))
+ if (unlikely(alloc_flags & ALLOC_TRYLOCK)) {
+ if (!spin_trylock_irqsave(&zone->lock, flags))
return NULL;
+ } else {
spin_lock_irqsave(&zone->lock, flags);
}
if (alloc_flags & ALLOC_HIGHATOMIC)
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
+ enum rmqueue_mode rmqm = RMQUEUE_NORMAL;
+
+ page = __rmqueue(zone, order, migratetype, alloc_flags, &rmqm);
/*
* If the allocation fails, allow OOM handling and
@@ -3422,18 +3470,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
return false;
}
-bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int highest_zoneidx)
-{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
-
- if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
- free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
-
- return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
- free_pages);
-}
-
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
@@ -7143,6 +7179,11 @@ static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages);
static bool lazy_accept = true;
+void unaccepted_cleanup_work(struct work_struct *work)
+{
+ static_branch_dec(&zones_with_unaccepted_pages);
+}
+
static int __init accept_memory_parse(char *p)
{
if (!strcmp(p, "lazy")) {
@@ -7181,8 +7222,27 @@ static void __accept_page(struct zone *zone, unsigned long *flags,
__free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
+ if (last) {
+ /*
+ * There are two corner cases:
+ *
+ * - If allocation occurs during the CPU bring up,
+ * static_branch_dec() cannot be used directly as
+ * it causes a deadlock on cpu_hotplug_lock.
+ *
+ * Instead, use schedule_work() to prevent deadlock.
+ *
+ * - If allocation occurs before workqueues are initialized,
+ * static_branch_dec() should be called directly.
+ *
+ * Workqueues are initialized before CPU bring up, so this
+ * will not conflict with the first scenario.
+ */
+ if (system_wq)
+ schedule_work(&zone->unaccepted_cleanup);
+ else
+ unaccepted_cleanup_work(&zone->unaccepted_cleanup);
+ }
}
void accept_page(struct page *page)
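
The page_alloc.c refactor splits __rmqueue_fallback() into __rmqueue_claim() and __rmqueue_steal() and threads an rmqueue_mode through __rmqueue(), so rmqueue_bulk() can remember which stage produced a page and start there on the next iteration under the same zone->lock hold. A reduced userspace model of that fall-through state machine, with stub stages standing in for the real freelist operations:

#include <stdbool.h>
#include <stdio.h>

enum rmqueue_mode { RMQUEUE_NORMAL, RMQUEUE_CMA, RMQUEUE_CLAIM, RMQUEUE_STEAL };

/* Stub stages: each returns true when its (pretend) freelist has a page. */
static bool take_normal(void) { return false; }
static bool take_cma(void)    { return false; }
static bool take_claim(void)  { return true;  }
static bool take_steal(void)  { return true;  }

/* Model of __rmqueue(): walk the stages from the remembered starting point. */
static bool rmqueue(enum rmqueue_mode *mode, bool allow_cma, bool allow_frag)
{
    switch (*mode) {
    case RMQUEUE_NORMAL:
        if (take_normal())
            return true;
        /* fall through */
    case RMQUEUE_CMA:
        if (allow_cma && take_cma()) {
            *mode = RMQUEUE_CMA;
            return true;
        }
        /* fall through */
    case RMQUEUE_CLAIM:
        if (take_claim()) {
            /* claimed a whole block: the preferred list is replenished */
            *mode = RMQUEUE_NORMAL;
            return true;
        }
        /* fall through */
    case RMQUEUE_STEAL:
        if (allow_frag && take_steal()) {
            *mode = RMQUEUE_STEAL;
            return true;
        }
    }
    return false;
}

int main(void)
{
    enum rmqueue_mode mode = RMQUEUE_NORMAL;

    for (int i = 0; i < 3; i++) {
        bool ok = rmqueue(&mode, true, true);

        printf("got page: %d (mode now %d)\n", ok, (int)mode);
    }
    return 0;
}
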
diff --git a/mm/slub.c b/mm/slub.c
index b46f87662e71..dc9e729e1d26 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1973,6 +1973,11 @@ static inline void handle_failed_objexts_alloc(unsigned long obj_exts,
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | \
__GFP_ACCOUNT | __GFP_NOFAIL)
+static inline void init_slab_obj_exts(struct slab *slab)
+{
+ slab->obj_exts = 0;
+}
+
int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2058,6 +2063,10 @@ static inline bool need_slab_obj_ext(void)
#else /* CONFIG_SLAB_OBJ_EXT */
+static inline void init_slab_obj_exts(struct slab *slab)
+{
+}
+
static int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab)
{
@@ -2637,6 +2646,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;
+ init_slab_obj_exts(slab);
account_slab(slab, oo_order(oo), s, flags);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index fbf2cf62ab9f..7d5d709cc838 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1902,6 +1902,14 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
unsigned long end)
{
struct vm_area_struct *ret;
+ bool give_up_on_oom = false;
+
+ /*
+ * If we are modifying only and not splitting, just give up on the merge
+ * if OOM prevents us from merging successfully.
+ */
+ if (start == vma->vm_start && end == vma->vm_end)
+ give_up_on_oom = true;
/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
@@ -1909,7 +1917,7 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
- NULL_VM_UFFD_CTX);
+ NULL_VM_UFFD_CTX, give_up_on_oom);
/*
* In the vma_merge() successful mprotect-like case 8:
@@ -1960,7 +1968,8 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
new_flags,
- (struct vm_userfaultfd_ctx){ctx});
+ (struct vm_userfaultfd_ctx){ctx},
+ /* give_up_on_oom = */false);
if (IS_ERR(vma))
return PTR_ERR(vma);
diff --git a/mm/vma.c b/mm/vma.c
index 5cdc5612bfc1..839d12f02c88 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -666,6 +666,9 @@ static void vmg_adjust_set_range(struct vma_merge_struct *vmg)
/*
* Actually perform the VMA merge operation.
*
+ * IMPORTANT: We guarantee that, should vmg->give_up_on_oom be set, we will not
+ * modify any VMAs or cause inconsistent state should an OOM condition arise.
+ *
* Returns 0 on success, or an error value on failure.
*/
static int commit_merge(struct vma_merge_struct *vmg)
@@ -685,6 +688,12 @@ static int commit_merge(struct vma_merge_struct *vmg)
init_multi_vma_prep(&vp, vma, vmg);
+ /*
+ * If vmg->give_up_on_oom is set, we're safe, because we don't actually
+ * manipulate any VMAs until we succeed at preallocation.
+ *
+ * Past this point, we will not return an error.
+ */
if (vma_iter_prealloc(vmg->vmi, vma))
return -ENOMEM;
@@ -915,7 +924,13 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
if (anon_dup)
unlink_anon_vmas(anon_dup);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+ /*
+ * We've cleaned up any cloned anon_vma's, no VMAs have been
+ * modified, no harm no foul if the user requests that we not
+ * report this and just give up, leaving the VMAs unmerged.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -926,7 +941,15 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
abort:
vma_iter_set(vmg->vmi, start);
vma_iter_load(vmg->vmi);
- vmg->state = VMA_MERGE_ERROR_NOMEM;
+
+ /*
+ * This means we have failed to clone anon_vma's correctly, but no
+ * actual changes to VMAs have occurred, so no harm no foul - if the
+ * user doesn't want this reported and instead just wants to give up on
+ * the merge, allow it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return NULL;
}
@@ -1068,6 +1091,10 @@ int vma_expand(struct vma_merge_struct *vmg)
/* This should already have been checked by this point. */
VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg);
vma_start_write(next);
+ /*
+ * In this case we don't report OOM, so vmg->give_up_on_oom is
+ * safe.
+ */
ret = dup_anon_vma(middle, next, &anon_dup);
if (ret)
return ret;
@@ -1090,9 +1117,15 @@ int vma_expand(struct vma_merge_struct *vmg)
return 0;
nomem:
- vmg->state = VMA_MERGE_ERROR_NOMEM;
if (anon_dup)
unlink_anon_vmas(anon_dup);
+ /*
+ * If the user requests that we just give up on OOM, we are safe to do so
+ * here, as commit merge provides this contract to us. Nothing has been
+ * changed - no harm no foul, just don't report it.
+ */
+ if (!vmg->give_up_on_oom)
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
return -ENOMEM;
}
@@ -1534,6 +1567,13 @@ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
if (vmg_nomem(vmg))
return ERR_PTR(-ENOMEM);
+ /*
+ * Split can fail for reasons other than OOM, so if the user requests
+ * this it's probably a mistake.
+ */
+ VM_WARN_ON(vmg->give_up_on_oom &&
+ (vma->vm_start != start || vma->vm_end != end));
+
/* Split any preceding portion of the VMA. */
if (vma->vm_start < start) {
int err = split_vma(vmg->vmi, vma, start, 1);
@@ -1602,12 +1642,15 @@ struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx)
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom)
{
VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
vmg.flags = new_flags;
vmg.uffd_ctx = new_ctx;
+ if (give_up_on_oom)
+ vmg.give_up_on_oom = true;
return vma_modify(&vmg);
}
diff --git a/mm/vma.h b/mm/vma.h
index 7356ca5a22d3..149926e8a6d1 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -114,6 +114,12 @@ struct vma_merge_struct {
*/
bool just_expand :1;
+ /*
+ * If a merge is possible, but an OOM error occurs, give up and don't
+ * execute the merge, returning NULL.
+ */
+ bool give_up_on_oom :1;
+
/* Internal flags set during merge process: */
/*
@@ -255,7 +261,8 @@ __must_check struct vm_area_struct
struct vm_area_struct *vma,
unsigned long start, unsigned long end,
unsigned long new_flags,
- struct vm_userfaultfd_ctx new_ctx);
+ struct vm_userfaultfd_ctx new_ctx,
+ bool give_up_on_oom);
__must_check struct vm_area_struct
*vma_merge_new_range(struct vma_merge_struct *vmg);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b620d74b0f66..3783e45bfc92 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -6736,6 +6736,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
* meet watermarks.
*/
for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) {
+ enum zone_stat_item item;
unsigned long free_pages;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
@@ -6746,11 +6747,33 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
/*
* In defrag_mode, watermarks must be met in whole
* blocks to avoid polluting allocator fallbacks.
+ *
+ * However, kswapd usually cannot accomplish this on
+ * its own and needs kcompactd support. Once it's
+ * reclaimed a compaction gap, and kswapd_shrink_node
+ * has dropped order, simply ensure there are enough
+ * base pages for compaction, wake kcompactd & sleep.
*/
- if (defrag_mode)
- free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS);
+ if (defrag_mode && order)
+ item = NR_FREE_PAGES_BLOCKS;
else
- free_pages = zone_page_state(zone, NR_FREE_PAGES);
+ item = NR_FREE_PAGES;
+
+ /*
+ * When there is a high number of CPUs in the system,
+ * the cumulative error from the vmstat per-cpu cache
+ * can blur the line between the watermarks. In that
+ * case, be safe and get an accurate snapshot.
+ *
+ * TODO: NR_FREE_PAGES_BLOCKS moves in steps of
+ * pageblock_nr_pages, while the vmstat pcp threshold
+ * is limited to 125. On many configurations that
+ * counter won't actually be per-cpu cached. But keep
+ * things simple for now; revisit when somebody cares.
+ */
+ free_pages = zone_page_state(zone, item);
+ if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark)
+ free_pages = zone_page_state_snapshot(zone, item);
if (__zone_watermark_ok(zone, order, mark, highest_zoneidx,
0, free_pages))
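
The pgdat_balanced() hunk falls back to zone_page_state_snapshot() whenever the cheap per-zone counter is below percpu_drift_mark, so accumulated per-CPU error cannot flip the watermark decision. A userspace model of that drift problem, with a stale global counter plus per-CPU deltas that have not been folded back yet:

#include <stdio.h>

#define NR_CPUS 4

static long zone_counter = 512;                     /* cheap, possibly stale value */
static long pcp_delta[NR_CPUS] = { -8, 3, -5, -2 }; /* per-CPU error not yet folded in */

static long zone_page_state(void)
{
    return zone_counter;
}

static long zone_page_state_snapshot(void)          /* accurate but more expensive */
{
    long v = zone_counter;

    for (int cpu = 0; cpu < NR_CPUS; cpu++)
        v += pcp_delta[cpu];
    return v;
}

int main(void)
{
    long drift_mark = 520;
    long free_pages = zone_page_state();

    /* Only pay for the accurate snapshot when the cheap value is close
     * enough to the watermark that per-CPU error could matter. */
    if (drift_mark && free_pages < drift_mark)
        free_pages = zone_page_state_snapshot();

    printf("free_pages used for the watermark check: %ld\n", free_pages);
    return 0;
}
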