summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig17
-rw-r--r--mm/Makefile2
-rw-r--r--mm/backing-dev.c84
-rw-r--r--mm/cma.c26
-rw-r--r--mm/compaction.c14
-rw-r--r--mm/damon/Kconfig24
-rw-r--r--mm/damon/Makefile4
-rw-r--r--mm/damon/core-test.h4
-rw-r--r--mm/damon/core.c446
-rw-r--r--mm/damon/dbgfs-test.h54
-rw-r--r--mm/damon/dbgfs.c430
-rw-r--r--mm/damon/paddr.c273
-rw-r--r--mm/damon/prmtv-common.c133
-rw-r--r--mm/damon/prmtv-common.h20
-rw-r--r--mm/damon/reclaim.c356
-rw-r--r--mm/damon/vaddr-test.h2
-rw-r--r--mm/damon/vaddr.c167
-rw-r--r--mm/debug.c26
-rw-r--r--mm/debug_vm_pgtable.c7
-rw-r--r--mm/filemap.c658
-rw-r--r--mm/folio-compat.c142
-rw-r--r--mm/gup.c144
-rw-r--r--mm/highmem.c7
-rw-r--r--mm/huge_memory.c15
-rw-r--r--mm/hugetlb.c701
-rw-r--r--mm/hugetlb_cgroup.c3
-rw-r--r--mm/internal.h58
-rw-r--r--mm/kasan/common.c8
-rw-r--r--mm/kasan/generic.c14
-rw-r--r--mm/kasan/hw_tags.c29
-rw-r--r--mm/kasan/kasan.h34
-rw-r--r--mm/kasan/report.c19
-rw-r--r--mm/kasan/shadow.c5
-rw-r--r--mm/kfence/core.c200
-rw-r--r--mm/kfence/kfence.h2
-rw-r--r--mm/kfence/kfence_test.c14
-rw-r--r--mm/khugepaged.c44
-rw-r--r--mm/ksm.c34
-rw-r--r--mm/list_lru.c58
-rw-r--r--mm/memblock.c49
-rw-r--r--mm/memcontrol.c569
-rw-r--r--mm/memfd.c4
-rw-r--r--mm/memory-failure.c149
-rw-r--r--mm/memory.c193
-rw-r--r--mm/memory_hotplug.c53
-rw-r--r--mm/mempolicy.c169
-rw-r--r--mm/mempool.c1
-rw-r--r--mm/memremap.c2
-rw-r--r--mm/migrate.c312
-rw-r--r--mm/mlock.c3
-rw-r--r--mm/mmap.c5
-rw-r--r--mm/mprotect.c5
-rw-r--r--mm/mremap.c86
-rw-r--r--mm/nommu.c7
-rw-r--r--mm/oom_kill.c54
-rw-r--r--mm/page-writeback.c487
-rw-r--r--mm/page_alloc.c141
-rw-r--r--mm/page_ext.c6
-rw-r--r--mm/page_io.c14
-rw-r--r--mm/page_isolation.c29
-rw-r--r--mm/page_owner.c32
-rw-r--r--mm/percpu.c8
-rw-r--r--mm/readahead.c3
-rw-r--r--mm/rmap.c22
-rw-r--r--mm/secretmem.c11
-rw-r--r--mm/shmem.c52
-rw-r--r--mm/slab.c20
-rw-r--r--mm/slab_common.c8
-rw-r--r--mm/slub.c144
-rw-r--r--mm/sparse-vmemmap.c2
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c220
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/swapfile.c16
-rw-r--r--mm/truncate.c19
-rw-r--r--mm/userfaultfd.c10
-rw-r--r--mm/util.c111
-rw-r--r--mm/vmalloc.c112
-rw-r--r--mm/vmpressure.c2
-rw-r--r--mm/vmscan.c184
-rw-r--r--mm/vmstat.c76
-rw-r--r--mm/workingset.c62
-rw-r--r--mm/zsmalloc.c7
83 files changed, 5405 insertions, 2336 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d16ba9249bc5..068ce591a13a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -109,6 +109,13 @@ config NUMA_KEEP_MEMINFO
config MEMORY_ISOLATION
bool
+# IORESOURCE_SYSTEM_RAM regions in the kernel resource tree that are marked
+# IORESOURCE_EXCLUSIVE cannot be mapped to user space, for example, via
+# /dev/mem.
+config EXCLUSIVE_SYSTEM_RAM
+ def_bool y
+ depends on !DEVMEM || STRICT_DEVMEM
+
#
# Only be set on architectures that have completely implemented memory hotplug
# feature. If you are not sure, don't touch it.
@@ -123,15 +130,11 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
select MEMORY_ISOLATION
- depends on SPARSEMEM || X86_64_ACPI_NUMA
+ depends on SPARSEMEM
depends on ARCH_ENABLE_MEMORY_HOTPLUG
- depends on 64BIT || BROKEN
+ depends on 64BIT
select NUMA_KEEP_MEMINFO if NUMA
-config MEMORY_HOTPLUG_SPARSE
- def_bool y
- depends on SPARSEMEM && MEMORY_HOTPLUG
-
config MEMORY_HOTPLUG_DEFAULT_ONLINE
bool "Online the newly added memory blocks by default"
depends on MEMORY_HOTPLUG
@@ -371,7 +374,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
+ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
help
diff --git a/mm/Makefile b/mm/Makefile
index fc60a40ce954..d6c0042e3aa0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -46,7 +46,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o
endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page-writeback.o \
+ maccess.o page-writeback.o folio-compat.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 4a9d4e27d0d9..1eead4761011 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -2,8 +2,9 @@
#include <linux/wait.h>
#include <linux/rbtree.h>
-#include <linux/backing-dev.h>
#include <linux/kthread.h>
+#include <linux/backing-dev.h>
+#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -291,8 +292,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
- if (wb != &bdi->wb)
- bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@@ -316,7 +315,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
- goto out_put_bdi;
+ return err;
for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
err = percpu_counter_init(&wb->stat[i], 0, gfp);
@@ -330,9 +329,6 @@ out_destroy_stat:
while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
-out_put_bdi:
- if (wb != &bdi->wb)
- bdi_put(bdi);
return err;
}
@@ -373,8 +369,6 @@ static void wb_exit(struct bdi_writeback *wb)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
- if (wb != &wb->bdi->wb)
- bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -397,6 +391,7 @@ static void cgwb_release_workfn(struct work_struct *work)
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
+ struct backing_dev_info *bdi = wb->bdi;
mutex_lock(&wb->bdi->cgwb_release_mutex);
wb_shutdown(wb);
@@ -416,6 +411,7 @@ static void cgwb_release_workfn(struct work_struct *work)
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
+ bdi_put(bdi);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
kfree_rcu(wb, rcu);
}
@@ -497,6 +493,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->b_attached);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
+ bdi_get(bdi);
/*
* The root wb determines the registered state of the whole bdi and
@@ -528,6 +525,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
goto out_put;
err_fprop_exit:
+ bdi_put(bdi);
fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
percpu_ref_exit(&wb->refcnt);
@@ -958,14 +956,14 @@ void bdi_unregister(struct backing_dev_info *bdi)
bdi->owner = NULL;
}
}
+EXPORT_SYMBOL(bdi_unregister);
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
- if (test_bit(WB_registered, &bdi->wb.state))
- bdi_unregister(bdi);
+ WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
kfree(bdi);
@@ -977,6 +975,22 @@ void bdi_put(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_put);
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
+{
+ struct super_block *sb;
+
+ if (!inode)
+ return &noop_backing_dev_info;
+
+ sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+ if (sb_is_blkdev_sb(sb))
+ return I_BDEV(inode)->bd_disk->bdi;
+#endif
+ return sb->s_bdi;
+}
+EXPORT_SYMBOL(inode_to_bdi);
+
const char *bdi_dev_name(struct backing_dev_info *bdi)
{
if (!bdi || !bdi->dev)
@@ -1041,51 +1055,3 @@ long congestion_wait(int sync, long timeout)
return ret;
}
EXPORT_SYMBOL(congestion_wait);
-
-/**
- * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
- * @sync: SYNC or ASYNC IO
- * @timeout: timeout in jiffies
- *
- * In the event of a congested backing_dev (any backing_dev) this waits
- * for up to @timeout jiffies for either a BDI to exit congestion of the
- * given @sync queue or a write to complete.
- *
- * The return value is 0 if the sleep is for the full timeout. Otherwise,
- * it is the number of jiffies that were still remaining when the function
- * returned. return_value == timeout implies the function did not sleep.
- */
-long wait_iff_congested(int sync, long timeout)
-{
- long ret;
- unsigned long start = jiffies;
- DEFINE_WAIT(wait);
- wait_queue_head_t *wqh = &congestion_wqh[sync];
-
- /*
- * If there is no congestion, yield if necessary instead
- * of sleeping on the congestion queue
- */
- if (atomic_read(&nr_wb_congested[sync]) == 0) {
- cond_resched();
-
- /* In case we scheduled, work out time remaining */
- ret = timeout - (jiffies - start);
- if (ret < 0)
- ret = 0;
-
- goto out;
- }
-
- /* Sleep until uncongested or a write happens */
- prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
- ret = io_schedule_timeout(timeout);
- finish_wait(wqh, &wait);
-
-out:
- trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
- jiffies_to_usecs(jiffies - start));
-
- return ret;
-}
-EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/cma.c b/mm/cma.c
index 995e15480937..bc9ca8f3c487 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -378,7 +378,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
return 0;
free_mem:
- memblock_free(base, size);
+ memblock_phys_free(base, size);
err:
pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
return ret;
@@ -524,6 +524,25 @@ out:
return page;
}
+bool cma_pages_valid(struct cma *cma, const struct page *pages,
+ unsigned long count)
+{
+ unsigned long pfn;
+
+ if (!cma || !pages)
+ return false;
+
+ pfn = page_to_pfn(pages);
+
+ if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) {
+ pr_debug("%s(page %p, count %lu)\n", __func__,
+ (void *)pages, count);
+ return false;
+ }
+
+ return true;
+}
+
/**
* cma_release() - release allocated pages
* @cma: Contiguous memory region for which the allocation is performed.
@@ -539,16 +558,13 @@ bool cma_release(struct cma *cma, const struct page *pages,
{
unsigned long pfn;
- if (!cma || !pages)
+ if (!cma_pages_valid(cma, pages, count))
return false;
pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
pfn = page_to_pfn(pages);
- if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
- return false;
-
VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
free_contig_range(pfn, count);
diff --git a/mm/compaction.c b/mm/compaction.c
index bfc93da1c2c7..6e446094ce90 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -761,6 +761,8 @@ isolate_freepages_range(struct compact_control *cc,
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(pg_data_t *pgdat)
{
+ bool too_many;
+
unsigned long active, inactive, isolated;
inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
@@ -770,7 +772,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
node_page_state(pgdat, NR_ISOLATED_ANON);
- return isolated > (inactive + active) / 2;
+ too_many = isolated > (inactive + active) / 2;
+ if (!too_many)
+ wake_throttle_isolated(pgdat);
+
+ return too_many;
}
/**
@@ -822,7 +828,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (cc->mode == MIGRATE_ASYNC)
return -EAGAIN;
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
if (fatal_signal_pending(current))
return -EINTR;
@@ -1022,7 +1028,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!TestClearPageLRU(page))
goto isolate_fail_put;
- lruvec = mem_cgroup_page_lruvec(page);
+ lruvec = folio_lruvec(page_folio(page));
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
@@ -1032,7 +1038,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
locked = lruvec;
- lruvec_memcg_debug(lruvec, page);
+ lruvec_memcg_debug(lruvec, page_folio(page));
/* Try get exclusive access under lock */
if (!skip_updated) {
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 37024798a97c..5bcf05851ad0 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -30,7 +30,15 @@ config DAMON_VADDR
select PAGE_IDLE_FLAG
help
This builds the default data access monitoring primitives for DAMON
- that works for virtual address spaces.
+ that work for virtual address spaces.
+
+config DAMON_PADDR
+ bool "Data access monitoring primitives for the physical address space"
+ depends on DAMON && MMU
+ select PAGE_IDLE_FLAG
+ help
+ This builds the default data access monitoring primitives for DAMON
+ that works for the physical address space.
config DAMON_VADDR_KUNIT_TEST
bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS
@@ -46,7 +54,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_DBGFS
bool "DAMON debugfs interface"
- depends on DAMON_VADDR && DEBUG_FS
+ depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
help
This builds the debugfs interface for DAMON. The user space admins
can use the interface for arbitrary data access monitoring.
@@ -65,4 +73,16 @@ config DAMON_DBGFS_KUNIT_TEST
If unsure, say N.
+config DAMON_RECLAIM
+ bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
+ depends on DAMON_PADDR
+ help
+ This builds the DAMON-based reclamation subsystem. It finds pages
+ that not accessed for a long time (cold) using DAMON and reclaim
+ those.
+
+ This is suggested to be used as a proactive and lightweight
+ reclamation under light memory pressure, while the traditional page
+ scanning-based reclamation is used for heavy pressure.
+
endmenu
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index fed4be3bace3..f7d5ac377a2b 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -1,5 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DAMON) := core.o
-obj-$(CONFIG_DAMON_VADDR) += vaddr.o
+obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o
+obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o
obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
+obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o
diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h
index c938a9c34e6c..7008c3735e99 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/core-test.h
@@ -219,14 +219,14 @@ static void damon_test_split_regions_of(struct kunit *test)
r = damon_new_region(0, 22);
damon_add_region(r, t);
damon_split_regions_of(c, t, 2);
- KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2u);
+ KUNIT_EXPECT_LE(test, damon_nr_regions(t), 2u);
damon_free_target(t);
t = damon_new_target(42);
r = damon_new_region(0, 220);
damon_add_region(r, t);
damon_split_regions_of(c, t, 4);
- KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 4u);
+ KUNIT_EXPECT_LE(test, damon_nr_regions(t), 4u);
damon_free_target(t);
damon_destroy_ctx(c);
}
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 30e9211f494a..c381b3c525d0 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -10,8 +10,10 @@
#include <linux/damon.h>
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/mm.h>
#include <linux/random.h>
#include <linux/slab.h>
+#include <linux/string.h>
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -45,6 +47,9 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
region->nr_accesses = 0;
INIT_LIST_HEAD(&region->list);
+ region->age = 0;
+ region->last_nr_accesses = 0;
+
return region;
}
@@ -82,6 +87,74 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t)
damon_free_region(r);
}
+struct damos *damon_new_scheme(
+ unsigned long min_sz_region, unsigned long max_sz_region,
+ unsigned int min_nr_accesses, unsigned int max_nr_accesses,
+ unsigned int min_age_region, unsigned int max_age_region,
+ enum damos_action action, struct damos_quota *quota,
+ struct damos_watermarks *wmarks)
+{
+ struct damos *scheme;
+
+ scheme = kmalloc(sizeof(*scheme), GFP_KERNEL);
+ if (!scheme)
+ return NULL;
+ scheme->min_sz_region = min_sz_region;
+ scheme->max_sz_region = max_sz_region;
+ scheme->min_nr_accesses = min_nr_accesses;
+ scheme->max_nr_accesses = max_nr_accesses;
+ scheme->min_age_region = min_age_region;
+ scheme->max_age_region = max_age_region;
+ scheme->action = action;
+ scheme->stat_count = 0;
+ scheme->stat_sz = 0;
+ INIT_LIST_HEAD(&scheme->list);
+
+ scheme->quota.ms = quota->ms;
+ scheme->quota.sz = quota->sz;
+ scheme->quota.reset_interval = quota->reset_interval;
+ scheme->quota.weight_sz = quota->weight_sz;
+ scheme->quota.weight_nr_accesses = quota->weight_nr_accesses;
+ scheme->quota.weight_age = quota->weight_age;
+ scheme->quota.total_charged_sz = 0;
+ scheme->quota.total_charged_ns = 0;
+ scheme->quota.esz = 0;
+ scheme->quota.charged_sz = 0;
+ scheme->quota.charged_from = 0;
+ scheme->quota.charge_target_from = NULL;
+ scheme->quota.charge_addr_from = 0;
+
+ scheme->wmarks.metric = wmarks->metric;
+ scheme->wmarks.interval = wmarks->interval;
+ scheme->wmarks.high = wmarks->high;
+ scheme->wmarks.mid = wmarks->mid;
+ scheme->wmarks.low = wmarks->low;
+ scheme->wmarks.activated = true;
+
+ return scheme;
+}
+
+void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
+{
+ list_add_tail(&s->list, &ctx->schemes);
+}
+
+static void damon_del_scheme(struct damos *s)
+{
+ list_del(&s->list);
+}
+
+static void damon_free_scheme(struct damos *s)
+{
+ kfree(s);
+}
+
+void damon_destroy_scheme(struct damos *s)
+{
+ damon_del_scheme(s);
+ damon_free_scheme(s);
+}
+
/*
* Construct a damon_target struct
*
@@ -107,6 +180,11 @@ void damon_add_target(struct damon_ctx *ctx, struct damon_target *t)
list_add_tail(&t->list, &ctx->adaptive_targets);
}
+bool damon_targets_empty(struct damon_ctx *ctx)
+{
+ return list_empty(&ctx->adaptive_targets);
+}
+
static void damon_del_target(struct damon_target *t)
{
list_del(&t->list);
@@ -153,6 +231,7 @@ struct damon_ctx *damon_new_ctx(void)
ctx->max_nr_regions = 1000;
INIT_LIST_HEAD(&ctx->adaptive_targets);
+ INIT_LIST_HEAD(&ctx->schemes);
return ctx;
}
@@ -172,7 +251,13 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
void damon_destroy_ctx(struct damon_ctx *ctx)
{
+ struct damos *s, *next_s;
+
damon_destroy_targets(ctx);
+
+ damon_for_each_scheme_safe(s, next_s, ctx)
+ damon_destroy_scheme(s);
+
kfree(ctx);
}
@@ -248,6 +333,30 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
}
/**
+ * damon_set_schemes() - Set data access monitoring based operation schemes.
+ * @ctx: monitoring context
+ * @schemes: array of the schemes
+ * @nr_schemes: number of entries in @schemes
+ *
+ * This function should not be called while the kdamond of the context is
+ * running.
+ *
+ * Return: 0 if success, or negative error code otherwise.
+ */
+int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
+ ssize_t nr_schemes)
+{
+ struct damos *s, *next;
+ ssize_t i;
+
+ damon_for_each_scheme_safe(s, next, ctx)
+ damon_destroy_scheme(s);
+ for (i = 0; i < nr_schemes; i++)
+ damon_add_scheme(ctx, schemes[i]);
+ return 0;
+}
+
+/**
* damon_nr_running_ctxs() - Return number of currently running contexts.
*/
int damon_nr_running_ctxs(void)
@@ -281,17 +390,6 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
return sz;
}
-static bool damon_kdamond_running(struct damon_ctx *ctx)
-{
- bool running;
-
- mutex_lock(&ctx->kdamond_lock);
- running = ctx->kdamond != NULL;
- mutex_unlock(&ctx->kdamond_lock);
-
- return running;
-}
-
static int kdamond_fn(void *data);
/*
@@ -309,12 +407,11 @@ static int __damon_start(struct damon_ctx *ctx)
mutex_lock(&ctx->kdamond_lock);
if (!ctx->kdamond) {
err = 0;
- ctx->kdamond_stop = false;
ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d",
nr_running_ctxs);
if (IS_ERR(ctx->kdamond)) {
err = PTR_ERR(ctx->kdamond);
- ctx->kdamond = 0;
+ ctx->kdamond = NULL;
}
}
mutex_unlock(&ctx->kdamond_lock);
@@ -365,13 +462,15 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs)
*/
static int __damon_stop(struct damon_ctx *ctx)
{
+ struct task_struct *tsk;
+
mutex_lock(&ctx->kdamond_lock);
- if (ctx->kdamond) {
- ctx->kdamond_stop = true;
+ tsk = ctx->kdamond;
+ if (tsk) {
+ get_task_struct(tsk);
mutex_unlock(&ctx->kdamond_lock);
- while (damon_kdamond_running(ctx))
- usleep_range(ctx->sample_interval,
- ctx->sample_interval * 2);
+ kthread_stop(tsk);
+ put_task_struct(tsk);
return 0;
}
mutex_unlock(&ctx->kdamond_lock);
@@ -444,11 +543,203 @@ static void kdamond_reset_aggregated(struct damon_ctx *c)
damon_for_each_region(r, t) {
trace_damon_aggregated(t, r, damon_nr_regions(t));
+ r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
}
}
+static void damon_split_region_at(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ unsigned long sz_r);
+
+static bool __damos_valid_target(struct damon_region *r, struct damos *s)
+{
+ unsigned long sz;
+
+ sz = r->ar.end - r->ar.start;
+ return s->min_sz_region <= sz && sz <= s->max_sz_region &&
+ s->min_nr_accesses <= r->nr_accesses &&
+ r->nr_accesses <= s->max_nr_accesses &&
+ s->min_age_region <= r->age && r->age <= s->max_age_region;
+}
+
+static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
+ struct damon_region *r, struct damos *s)
+{
+ bool ret = __damos_valid_target(r, s);
+
+ if (!ret || !s->quota.esz || !c->primitive.get_scheme_score)
+ return ret;
+
+ return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score;
+}
+
+static void damon_do_apply_schemes(struct damon_ctx *c,
+ struct damon_target *t,
+ struct damon_region *r)
+{
+ struct damos *s;
+
+ damon_for_each_scheme(s, c) {
+ struct damos_quota *quota = &s->quota;
+ unsigned long sz = r->ar.end - r->ar.start;
+ struct timespec64 begin, end;
+
+ if (!s->wmarks.activated)
+ continue;
+
+ /* Check the quota */
+ if (quota->esz && quota->charged_sz >= quota->esz)
+ continue;
+
+ /* Skip previously charged regions */
+ if (quota->charge_target_from) {
+ if (t != quota->charge_target_from)
+ continue;
+ if (r == damon_last_region(t)) {
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ continue;
+ }
+ if (quota->charge_addr_from &&
+ r->ar.end <= quota->charge_addr_from)
+ continue;
+
+ if (quota->charge_addr_from && r->ar.start <
+ quota->charge_addr_from) {
+ sz = ALIGN_DOWN(quota->charge_addr_from -
+ r->ar.start, DAMON_MIN_REGION);
+ if (!sz) {
+ if (r->ar.end - r->ar.start <=
+ DAMON_MIN_REGION)
+ continue;
+ sz = DAMON_MIN_REGION;
+ }
+ damon_split_region_at(c, t, r, sz);
+ r = damon_next_region(r);
+ sz = r->ar.end - r->ar.start;
+ }
+ quota->charge_target_from = NULL;
+ quota->charge_addr_from = 0;
+ }
+
+ if (!damos_valid_target(c, t, r, s))
+ continue;
+
+ /* Apply the scheme */
+ if (c->primitive.apply_scheme) {
+ if (quota->esz &&
+ quota->charged_sz + sz > quota->esz) {
+ sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
+ DAMON_MIN_REGION);
+ if (!sz)
+ goto update_stat;
+ damon_split_region_at(c, t, r, sz);
+ }
+ ktime_get_coarse_ts64(&begin);
+ c->primitive.apply_scheme(c, t, r, s);
+ ktime_get_coarse_ts64(&end);
+ quota->total_charged_ns += timespec64_to_ns(&end) -
+ timespec64_to_ns(&begin);
+ quota->charged_sz += sz;
+ if (quota->esz && quota->charged_sz >= quota->esz) {
+ quota->charge_target_from = t;
+ quota->charge_addr_from = r->ar.end + 1;
+ }
+ }
+ if (s->action != DAMOS_STAT)
+ r->age = 0;
+
+update_stat:
+ s->stat_count++;
+ s->stat_sz += sz;
+ }
+}
+
+/* Shouldn't be called if quota->ms and quota->sz are zero */
+static void damos_set_effective_quota(struct damos_quota *quota)
+{
+ unsigned long throughput;
+ unsigned long esz;
+
+ if (!quota->ms) {
+ quota->esz = quota->sz;
+ return;
+ }
+
+ if (quota->total_charged_ns)
+ throughput = quota->total_charged_sz * 1000000 /
+ quota->total_charged_ns;
+ else
+ throughput = PAGE_SIZE * 1024;
+ esz = throughput * quota->ms;
+
+ if (quota->sz && quota->sz < esz)
+ esz = quota->sz;
+ quota->esz = esz;
+}
+
+static void kdamond_apply_schemes(struct damon_ctx *c)
+{
+ struct damon_target *t;
+ struct damon_region *r, *next_r;
+ struct damos *s;
+
+ damon_for_each_scheme(s, c) {
+ struct damos_quota *quota = &s->quota;
+ unsigned long cumulated_sz;
+ unsigned int score, max_score = 0;
+
+ if (!s->wmarks.activated)
+ continue;
+
+ if (!quota->ms && !quota->sz)
+ continue;
+
+ /* New charge window starts */
+ if (time_after_eq(jiffies, quota->charged_from +
+ msecs_to_jiffies(
+ quota->reset_interval))) {
+ quota->total_charged_sz += quota->charged_sz;
+ quota->charged_from = jiffies;
+ quota->charged_sz = 0;
+ damos_set_effective_quota(quota);
+ }
+
+ if (!c->primitive.get_scheme_score)
+ continue;
+
+ /* Fill up the score histogram */
+ memset(quota->histogram, 0, sizeof(quota->histogram));
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ if (!__damos_valid_target(r, s))
+ continue;
+ score = c->primitive.get_scheme_score(
+ c, t, r, s);
+ quota->histogram[score] +=
+ r->ar.end - r->ar.start;
+ if (score > max_score)
+ max_score = score;
+ }
+ }
+
+ /* Set the min score limit */
+ for (cumulated_sz = 0, score = max_score; ; score--) {
+ cumulated_sz += quota->histogram[score];
+ if (cumulated_sz >= quota->esz || !score)
+ break;
+ }
+ quota->min_score = score;
+ }
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region_safe(r, next_r, t)
+ damon_do_apply_schemes(c, t, r);
+ }
+}
+
#define sz_damon_region(r) (r->ar.end - r->ar.start)
/*
@@ -461,6 +752,7 @@ static void damon_merge_two_regions(struct damon_target *t,
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
+ l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
l->ar.end = r->ar.end;
damon_destroy_region(r, t);
}
@@ -480,6 +772,11 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
struct damon_region *r, *prev = NULL, *next;
damon_for_each_region_safe(r, next, t) {
+ if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres)
+ r->age = 0;
+ else
+ r->age++;
+
if (prev && prev->ar.end == r->ar.start &&
diff_of(prev->nr_accesses, r->nr_accesses) <= thres &&
sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
@@ -527,6 +824,9 @@ static void damon_split_region_at(struct damon_ctx *ctx,
r->ar.end = new->ar.start;
+ new->age = r->age;
+ new->last_nr_accesses = r->last_nr_accesses;
+
damon_insert_region(new, r, damon_next_region(r), t);
}
@@ -615,12 +915,8 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx)
static bool kdamond_need_stop(struct damon_ctx *ctx)
{
struct damon_target *t;
- bool stop;
- mutex_lock(&ctx->kdamond_lock);
- stop = ctx->kdamond_stop;
- mutex_unlock(&ctx->kdamond_lock);
- if (stop)
+ if (kthread_should_stop())
return true;
if (!ctx->primitive.target_valid)
@@ -634,11 +930,81 @@ static bool kdamond_need_stop(struct damon_ctx *ctx)
return true;
}
-static void set_kdamond_stop(struct damon_ctx *ctx)
+static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
{
- mutex_lock(&ctx->kdamond_lock);
- ctx->kdamond_stop = true;
- mutex_unlock(&ctx->kdamond_lock);
+ struct sysinfo i;
+
+ switch (metric) {
+ case DAMOS_WMARK_FREE_MEM_RATE:
+ si_meminfo(&i);
+ return i.freeram * 1000 / i.totalram;
+ default:
+ break;
+ }
+ return -EINVAL;
+}
+
+/*
+ * Returns zero if the scheme is active. Else, returns time to wait for next
+ * watermark check in micro-seconds.
+ */
+static unsigned long damos_wmark_wait_us(struct damos *scheme)
+{
+ unsigned long metric;
+
+ if (scheme->wmarks.metric == DAMOS_WMARK_NONE)
+ return 0;
+
+ metric = damos_wmark_metric_value(scheme->wmarks.metric);
+ /* higher than high watermark or lower than low watermark */
+ if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) {
+ if (scheme->wmarks.activated)
+ pr_debug("deactivate a scheme (%d) for %s wmark\n",
+ scheme->action,
+ metric > scheme->wmarks.high ?
+ "high" : "low");
+ scheme->wmarks.activated = false;
+ return scheme->wmarks.interval;
+ }
+
+ /* inactive and higher than middle watermark */
+ if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) &&
+ !scheme->wmarks.activated)
+ return scheme->wmarks.interval;
+
+ if (!scheme->wmarks.activated)
+ pr_debug("activate a scheme (%d)\n", scheme->action);
+ scheme->wmarks.activated = true;
+ return 0;
+}
+
+static void kdamond_usleep(unsigned long usecs)
+{
+ if (usecs > 100 * 1000)
+ schedule_timeout_interruptible(usecs_to_jiffies(usecs));
+ else
+ usleep_range(usecs, usecs + 1);
+}
+
+/* Returns negative error code if it's not activated but should return */
+static int kdamond_wait_activation(struct damon_ctx *ctx)
+{
+ struct damos *s;
+ unsigned long wait_time;
+ unsigned long min_wait_time = 0;
+
+ while (!kdamond_need_stop(ctx)) {
+ damon_for_each_scheme(s, ctx) {
+ wait_time = damos_wmark_wait_us(s);
+ if (!min_wait_time || wait_time < min_wait_time)
+ min_wait_time = wait_time;
+ }
+ if (!min_wait_time)
+ return 0;
+
+ kdamond_usleep(min_wait_time);
+ }
+ return -EBUSY;
}
/*
@@ -651,24 +1017,26 @@ static int kdamond_fn(void *data)
struct damon_region *r, *next;
unsigned int max_nr_accesses = 0;
unsigned long sz_limit = 0;
+ bool done = false;
- mutex_lock(&ctx->kdamond_lock);
- pr_info("kdamond (%d) starts\n", ctx->kdamond->pid);
- mutex_unlock(&ctx->kdamond_lock);
+ pr_debug("kdamond (%d) starts\n", current->pid);
if (ctx->primitive.init)
ctx->primitive.init(ctx);
if (ctx->callback.before_start && ctx->callback.before_start(ctx))
- set_kdamond_stop(ctx);
+ done = true;
sz_limit = damon_region_sz_limit(ctx);
- while (!kdamond_need_stop(ctx)) {
+ while (!kdamond_need_stop(ctx) && !done) {
+ if (kdamond_wait_activation(ctx))
+ continue;
+
if (ctx->primitive.prepare_access_checks)
ctx->primitive.prepare_access_checks(ctx);
if (ctx->callback.after_sampling &&
ctx->callback.after_sampling(ctx))
- set_kdamond_stop(ctx);
+ done = true;
usleep_range(ctx->sample_interval, ctx->sample_interval + 1);
@@ -681,7 +1049,8 @@ static int kdamond_fn(void *data)
sz_limit);
if (ctx->callback.after_aggregation &&
ctx->callback.after_aggregation(ctx))
- set_kdamond_stop(ctx);
+ done = true;
+ kdamond_apply_schemes(ctx);
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
if (ctx->primitive.reset_aggregated)
@@ -699,13 +1068,12 @@ static int kdamond_fn(void *data)
damon_destroy_region(r, t);
}
- if (ctx->callback.before_terminate &&
- ctx->callback.before_terminate(ctx))
- set_kdamond_stop(ctx);
+ if (ctx->callback.before_terminate)
+ ctx->callback.before_terminate(ctx);
if (ctx->primitive.cleanup)
ctx->primitive.cleanup(ctx);
- pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid);
+ pr_debug("kdamond (%d) finishes\n", current->pid);
mutex_lock(&ctx->kdamond_lock);
ctx->kdamond = NULL;
mutex_unlock(&ctx->kdamond_lock);
@@ -714,7 +1082,7 @@ static int kdamond_fn(void *data)
nr_running_ctxs--;
mutex_unlock(&damon_lock);
- do_exit(0);
+ return 0;
}
#include "core-test.h"
diff --git a/mm/damon/dbgfs-test.h b/mm/damon/dbgfs-test.h
index 4eddcfa73996..86b9f9528231 100644
--- a/mm/damon/dbgfs-test.h
+++ b/mm/damon/dbgfs-test.h
@@ -109,9 +109,63 @@ static void damon_dbgfs_test_set_targets(struct kunit *test)
dbgfs_destroy_ctx(ctx);
}
+static void damon_dbgfs_test_set_init_regions(struct kunit *test)
+{
+ struct damon_ctx *ctx = damon_new_ctx();
+ unsigned long ids[] = {1, 2, 3};
+ /* Each line represents one region in ``<target id> <start> <end>`` */
+ char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45",
+ "2 10 20\n",
+ "2 10 20\n1 39 59\n1 70 134\n 2 20 25\n",
+ ""};
+ /* Reading the file again will show sorted, clean output */
+ char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n",
+ "2 10 20\n",
+ "1 39 59\n1 70 134\n2 10 20\n2 20 25\n",
+ ""};
+ char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */
+ "2 10 20\n 2 14 26\n", /* regions overlap */
+ "1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */
+ char *input, *expect;
+ int i, rc;
+ char buf[256];
+
+ damon_set_targets(ctx, ids, 3);
+
+ /* Put valid inputs and check the results */
+ for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
+ input = valid_inputs[i];
+ expect = valid_expects[i];
+
+ rc = set_init_regions(ctx, input, strnlen(input, 256));
+ KUNIT_EXPECT_EQ(test, rc, 0);
+
+ memset(buf, 0, 256);
+ sprint_init_regions(ctx, buf, 256);
+
+ KUNIT_EXPECT_STREQ(test, (char *)buf, expect);
+ }
+ /* Put invalid inputs and check the return error code */
+ for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) {
+ input = invalid_inputs[i];
+ pr_info("input: %s\n", input);
+ rc = set_init_regions(ctx, input, strnlen(input, 256));
+ KUNIT_EXPECT_EQ(test, rc, -EINVAL);
+
+ memset(buf, 0, 256);
+ sprint_init_regions(ctx, buf, 256);
+
+ KUNIT_EXPECT_STREQ(test, (char *)buf, "");
+ }
+
+ damon_set_targets(ctx, NULL, 0);
+ damon_destroy_ctx(ctx);
+}
+
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_dbgfs_test_str_to_target_ids),
KUNIT_CASE(damon_dbgfs_test_set_targets),
+ KUNIT_CASE(damon_dbgfs_test_set_init_regions),
{},
};
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index faee070977d8..eccc14b34901 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -69,8 +69,7 @@ static ssize_t dbgfs_attrs_write(struct file *file,
struct damon_ctx *ctx = file->private_data;
unsigned long s, a, r, minr, maxr;
char *kbuf;
- ssize_t ret = count;
- int err;
+ ssize_t ret;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
@@ -88,11 +87,182 @@ static ssize_t dbgfs_attrs_write(struct file *file,
goto unlock_out;
}
- err = damon_set_attrs(ctx, s, a, r, minr, maxr);
- if (err)
- ret = err;
+ ret = damon_set_attrs(ctx, s, a, r, minr, maxr);
+ if (!ret)
+ ret = count;
+unlock_out:
+ mutex_unlock(&ctx->kdamond_lock);
+out:
+ kfree(kbuf);
+ return ret;
+}
+
+static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
+{
+ struct damos *s;
+ int written = 0;
+ int rc;
+
+ damon_for_each_scheme(s, c) {
+ rc = scnprintf(&buf[written], len - written,
+ "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n",
+ s->min_sz_region, s->max_sz_region,
+ s->min_nr_accesses, s->max_nr_accesses,
+ s->min_age_region, s->max_age_region,
+ s->action,
+ s->quota.ms, s->quota.sz,
+ s->quota.reset_interval,
+ s->quota.weight_sz,
+ s->quota.weight_nr_accesses,
+ s->quota.weight_age,
+ s->wmarks.metric, s->wmarks.interval,
+ s->wmarks.high, s->wmarks.mid, s->wmarks.low,
+ s->stat_count, s->stat_sz);
+ if (!rc)
+ return -ENOMEM;
+
+ written += rc;
+ }
+ return written;
+}
+
+static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ ssize_t len;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ mutex_lock(&ctx->kdamond_lock);
+ len = sprint_schemes(ctx, kbuf, count);
+ mutex_unlock(&ctx->kdamond_lock);
+ if (len < 0)
+ goto out;
+ len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
+
+out:
+ kfree(kbuf);
+ return len;
+}
+
+static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
+{
+ ssize_t i;
+
+ for (i = 0; i < nr_schemes; i++)
+ kfree(schemes[i]);
+ kfree(schemes);
+}
+
+static bool damos_action_valid(int action)
+{
+ switch (action) {
+ case DAMOS_WILLNEED:
+ case DAMOS_COLD:
+ case DAMOS_PAGEOUT:
+ case DAMOS_HUGEPAGE:
+ case DAMOS_NOHUGEPAGE:
+ case DAMOS_STAT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/*
+ * Converts a string into an array of struct damos pointers
+ *
+ * Returns an array of struct damos pointers that converted if the conversion
+ * success, or NULL otherwise.
+ */
+static struct damos **str_to_schemes(const char *str, ssize_t len,
+ ssize_t *nr_schemes)
+{
+ struct damos *scheme, **schemes;
+ const int max_nr_schemes = 256;
+ int pos = 0, parsed, ret;
+ unsigned long min_sz, max_sz;
+ unsigned int min_nr_a, max_nr_a, min_age, max_age;
+ unsigned int action;
+
+ schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
+ GFP_KERNEL);
+ if (!schemes)
+ return NULL;
+
+ *nr_schemes = 0;
+ while (pos < len && *nr_schemes < max_nr_schemes) {
+ struct damos_quota quota = {};
+ struct damos_watermarks wmarks;
+
+ ret = sscanf(&str[pos],
+ "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
+ &min_sz, &max_sz, &min_nr_a, &max_nr_a,
+ &min_age, &max_age, &action, &quota.ms,
+ &quota.sz, &quota.reset_interval,
+ &quota.weight_sz, &quota.weight_nr_accesses,
+ &quota.weight_age, &wmarks.metric,
+ &wmarks.interval, &wmarks.high, &wmarks.mid,
+ &wmarks.low, &parsed);
+ if (ret != 18)
+ break;
+ if (!damos_action_valid(action)) {
+ pr_err("wrong action %d\n", action);
+ goto fail;
+ }
+
+ pos += parsed;
+ scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
+ min_age, max_age, action, &quota, &wmarks);
+ if (!scheme)
+ goto fail;
+
+ schemes[*nr_schemes] = scheme;
+ *nr_schemes += 1;
+ }
+ return schemes;
+fail:
+ free_schemes_arr(schemes, *nr_schemes);
+ return NULL;
+}
+
+static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ struct damos **schemes;
+ ssize_t nr_schemes = 0, ret;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ schemes = str_to_schemes(kbuf, count, &nr_schemes);
+ if (!schemes) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ ret = -EBUSY;
+ goto unlock_out;
+ }
+
+ ret = damon_set_schemes(ctx, schemes, nr_schemes);
+ if (!ret) {
+ ret = count;
+ nr_schemes = 0;
+ }
+
unlock_out:
mutex_unlock(&ctx->kdamond_lock);
+ free_schemes_arr(schemes, nr_schemes);
out:
kfree(kbuf);
return ret;
@@ -185,26 +355,31 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
+ bool id_is_pid = true;
char *kbuf, *nrs;
unsigned long *targets;
ssize_t nr_targets;
- ssize_t ret = count;
+ ssize_t ret;
int i;
- int err;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
nrs = kbuf;
+ if (!strncmp(kbuf, "paddr\n", count)) {
+ id_is_pid = false;
+ /* target id is meaningless here, but we set it just for fun */
+ scnprintf(kbuf, count, "42 ");
+ }
- targets = str_to_target_ids(nrs, ret, &nr_targets);
+ targets = str_to_target_ids(nrs, count, &nr_targets);
if (!targets) {
ret = -ENOMEM;
goto out;
}
- if (targetid_is_pid(ctx)) {
+ if (id_is_pid) {
for (i = 0; i < nr_targets; i++) {
targets[i] = (unsigned long)find_get_pid(
(int)targets[i]);
@@ -218,17 +393,27 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
mutex_lock(&ctx->kdamond_lock);
if (ctx->kdamond) {
- if (targetid_is_pid(ctx))
+ if (id_is_pid)
dbgfs_put_pids(targets, nr_targets);
ret = -EBUSY;
goto unlock_out;
}
- err = damon_set_targets(ctx, targets, nr_targets);
- if (err) {
- if (targetid_is_pid(ctx))
+ /* remove targets with previously-set primitive */
+ damon_set_targets(ctx, NULL, 0);
+
+ /* Configure the context for the address space type */
+ if (id_is_pid)
+ damon_va_set_primitives(ctx);
+ else
+ damon_pa_set_primitives(ctx);
+
+ ret = damon_set_targets(ctx, targets, nr_targets);
+ if (ret) {
+ if (id_is_pid)
dbgfs_put_pids(targets, nr_targets);
- ret = err;
+ } else {
+ ret = count;
}
unlock_out:
@@ -240,6 +425,152 @@ out:
return ret;
}
+static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ int written = 0;
+ int rc;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region(r, t) {
+ rc = scnprintf(&buf[written], len - written,
+ "%lu %lu %lu\n",
+ t->id, r->ar.start, r->ar.end);
+ if (!rc)
+ return -ENOMEM;
+ written += rc;
+ }
+ }
+ return written;
+}
+
+static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ ssize_t len;
+
+ kbuf = kmalloc(count, GFP_KERNEL);
+ if (!kbuf)
+ return -ENOMEM;
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ mutex_unlock(&ctx->kdamond_lock);
+ len = -EBUSY;
+ goto out;
+ }
+
+ len = sprint_init_regions(ctx, kbuf, count);
+ mutex_unlock(&ctx->kdamond_lock);
+ if (len < 0)
+ goto out;
+ len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
+
+out:
+ kfree(kbuf);
+ return len;
+}
+
+static int add_init_region(struct damon_ctx *c,
+ unsigned long target_id, struct damon_addr_range *ar)
+{
+ struct damon_target *t;
+ struct damon_region *r, *prev;
+ unsigned long id;
+ int rc = -EINVAL;
+
+ if (ar->start >= ar->end)
+ return -EINVAL;
+
+ damon_for_each_target(t, c) {
+ id = t->id;
+ if (targetid_is_pid(c))
+ id = (unsigned long)pid_vnr((struct pid *)id);
+ if (id == target_id) {
+ r = damon_new_region(ar->start, ar->end);
+ if (!r)
+ return -ENOMEM;
+ damon_add_region(r, t);
+ if (damon_nr_regions(t) > 1) {
+ prev = damon_prev_region(r);
+ if (prev->ar.end > r->ar.start) {
+ damon_destroy_region(r, t);
+ return -EINVAL;
+ }
+ }
+ rc = 0;
+ }
+ }
+ return rc;
+}
+
+static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
+{
+ struct damon_target *t;
+ struct damon_region *r, *next;
+ int pos = 0, parsed, ret;
+ unsigned long target_id;
+ struct damon_addr_range ar;
+ int err;
+
+ damon_for_each_target(t, c) {
+ damon_for_each_region_safe(r, next, t)
+ damon_destroy_region(r, t);
+ }
+
+ while (pos < len) {
+ ret = sscanf(&str[pos], "%lu %lu %lu%n",
+ &target_id, &ar.start, &ar.end, &parsed);
+ if (ret != 3)
+ break;
+ err = add_init_region(c, target_id, &ar);
+ if (err)
+ goto fail;
+ pos += parsed;
+ }
+
+ return 0;
+
+fail:
+ damon_for_each_target(t, c) {
+ damon_for_each_region_safe(r, next, t)
+ damon_destroy_region(r, t);
+ }
+ return err;
+}
+
+static ssize_t dbgfs_init_regions_write(struct file *file,
+ const char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct damon_ctx *ctx = file->private_data;
+ char *kbuf;
+ ssize_t ret = count;
+ int err;
+
+ kbuf = user_input_str(buf, count, ppos);
+ if (IS_ERR(kbuf))
+ return PTR_ERR(kbuf);
+
+ mutex_lock(&ctx->kdamond_lock);
+ if (ctx->kdamond) {
+ ret = -EBUSY;
+ goto unlock_out;
+ }
+
+ err = set_init_regions(ctx, kbuf, ret);
+ if (err)
+ ret = err;
+
+unlock_out:
+ mutex_unlock(&ctx->kdamond_lock);
+ kfree(kbuf);
+ return ret;
+}
+
static ssize_t dbgfs_kdamond_pid_read(struct file *file,
char __user *buf, size_t count, loff_t *ppos)
{
@@ -279,12 +610,24 @@ static const struct file_operations attrs_fops = {
.write = dbgfs_attrs_write,
};
+static const struct file_operations schemes_fops = {
+ .open = damon_dbgfs_open,
+ .read = dbgfs_schemes_read,
+ .write = dbgfs_schemes_write,
+};
+
static const struct file_operations target_ids_fops = {
.open = damon_dbgfs_open,
.read = dbgfs_target_ids_read,
.write = dbgfs_target_ids_write,
};
+static const struct file_operations init_regions_fops = {
+ .open = damon_dbgfs_open,
+ .read = dbgfs_init_regions_read,
+ .write = dbgfs_init_regions_write,
+};
+
static const struct file_operations kdamond_pid_fops = {
.open = damon_dbgfs_open,
.read = dbgfs_kdamond_pid_read,
@@ -292,28 +635,27 @@ static const struct file_operations kdamond_pid_fops = {
static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
{
- const char * const file_names[] = {"attrs", "target_ids",
- "kdamond_pid"};
- const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops,
- &kdamond_pid_fops};
+ const char * const file_names[] = {"attrs", "schemes", "target_ids",
+ "init_regions", "kdamond_pid"};
+ const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
+ &target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
int i;
for (i = 0; i < ARRAY_SIZE(file_names); i++)
debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
}
-static int dbgfs_before_terminate(struct damon_ctx *ctx)
+static void dbgfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
if (!targetid_is_pid(ctx))
- return 0;
+ return;
damon_for_each_target_safe(t, next, ctx) {
put_pid((struct pid *)t->id);
damon_destroy_target(t);
}
- return 0;
}
static struct damon_ctx *dbgfs_new_ctx(void)
@@ -388,8 +730,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file,
{
char *kbuf;
char *ctx_name;
- ssize_t ret = count;
- int err;
+ ssize_t ret;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
@@ -407,9 +748,9 @@ static ssize_t dbgfs_mk_context_write(struct file *file,
}
mutex_lock(&damon_dbgfs_lock);
- err = dbgfs_mk_context(ctx_name);
- if (err)
- ret = err;
+ ret = dbgfs_mk_context(ctx_name);
+ if (!ret)
+ ret = count;
mutex_unlock(&damon_dbgfs_lock);
out:
@@ -478,8 +819,7 @@ static ssize_t dbgfs_rm_context_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
char *kbuf;
- ssize_t ret = count;
- int err;
+ ssize_t ret;
char *ctx_name;
kbuf = user_input_str(buf, count, ppos);
@@ -498,9 +838,9 @@ static ssize_t dbgfs_rm_context_write(struct file *file,
}
mutex_lock(&damon_dbgfs_lock);
- err = dbgfs_rm_context(ctx_name);
- if (err)
- ret = err;
+ ret = dbgfs_rm_context(ctx_name);
+ if (!ret)
+ ret = count;
mutex_unlock(&damon_dbgfs_lock);
out:
@@ -524,9 +864,8 @@ static ssize_t dbgfs_monitor_on_read(struct file *file,
static ssize_t dbgfs_monitor_on_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
- ssize_t ret = count;
+ ssize_t ret;
char *kbuf;
- int err;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
@@ -538,15 +877,24 @@ static ssize_t dbgfs_monitor_on_write(struct file *file,
return -EINVAL;
}
- if (!strncmp(kbuf, "on", count))
- err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
- else if (!strncmp(kbuf, "off", count))
- err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
- else
- err = -EINVAL;
+ if (!strncmp(kbuf, "on", count)) {
+ int i;
- if (err)
- ret = err;
+ for (i = 0; i < dbgfs_nr_ctxs; i++) {
+ if (damon_targets_empty(dbgfs_ctxs[i])) {
+ kfree(kbuf);
+ return -EINVAL;
+ }
+ }
+ ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
+ } else if (!strncmp(kbuf, "off", count)) {
+ ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
+ } else {
+ ret = -EINVAL;
+ }
+
+ if (!ret)
+ ret = count;
kfree(kbuf);
return ret;
}
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
new file mode 100644
index 000000000000..a496d6f203d6
--- /dev/null
+++ b/mm/damon/paddr.c
@@ -0,0 +1,273 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON Primitives for The Physical Address Space
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-pa: " fmt
+
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+
+#include "../internal.h"
+#include "prmtv-common.h"
+
+static bool __damon_pa_mkold(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, void *arg)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = addr,
+ };
+
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte)
+ damon_ptep_mkold(pvmw.pte, vma->vm_mm, addr);
+ else
+ damon_pmdp_mkold(pvmw.pmd, vma->vm_mm, addr);
+ }
+ return true;
+}
+
+static void damon_pa_mkold(unsigned long paddr)
+{
+ struct page *page = damon_get_page(PHYS_PFN(paddr));
+ struct rmap_walk_control rwc = {
+ .rmap_one = __damon_pa_mkold,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!page)
+ return;
+
+ if (!page_mapped(page) || !page_rmapping(page)) {
+ set_page_idle(page);
+ goto out;
+ }
+
+ need_lock = !PageAnon(page) || PageKsm(page);
+ if (need_lock && !trylock_page(page))
+ goto out;
+
+ rmap_walk(page, &rwc);
+
+ if (need_lock)
+ unlock_page(page);
+
+out:
+ put_page(page);
+}
+
+static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
+ struct damon_region *r)
+{
+ r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
+
+ damon_pa_mkold(r->sampling_addr);
+}
+
+void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+
+ damon_for_each_target(t, ctx) {
+ damon_for_each_region(r, t)
+ __damon_pa_prepare_access_check(ctx, r);
+ }
+}
+
+struct damon_pa_access_chk_result {
+ unsigned long page_sz;
+ bool accessed;
+};
+
+static bool __damon_pa_young(struct page *page, struct vm_area_struct *vma,
+ unsigned long addr, void *arg)
+{
+ struct damon_pa_access_chk_result *result = arg;
+ struct page_vma_mapped_walk pvmw = {
+ .page = page,
+ .vma = vma,
+ .address = addr,
+ };
+
+ result->accessed = false;
+ result->page_sz = PAGE_SIZE;
+ while (page_vma_mapped_walk(&pvmw)) {
+ addr = pvmw.address;
+ if (pvmw.pte) {
+ result->accessed = pte_young(*pvmw.pte) ||
+ !page_is_idle(page) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+ } else {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ result->accessed = pmd_young(*pvmw.pmd) ||
+ !page_is_idle(page) ||
+ mmu_notifier_test_young(vma->vm_mm, addr);
+ result->page_sz = ((1UL) << HPAGE_PMD_SHIFT);
+#else
+ WARN_ON_ONCE(1);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ }
+ if (result->accessed) {
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ }
+
+ /* If accessed, stop walking */
+ return !result->accessed;
+}
+
+static bool damon_pa_young(unsigned long paddr, unsigned long *page_sz)
+{
+ struct page *page = damon_get_page(PHYS_PFN(paddr));
+ struct damon_pa_access_chk_result result = {
+ .page_sz = PAGE_SIZE,
+ .accessed = false,
+ };
+ struct rmap_walk_control rwc = {
+ .arg = &result,
+ .rmap_one = __damon_pa_young,
+ .anon_lock = page_lock_anon_vma_read,
+ };
+ bool need_lock;
+
+ if (!page)
+ return false;
+
+ if (!page_mapped(page) || !page_rmapping(page)) {
+ if (page_is_idle(page))
+ result.accessed = false;
+ else
+ result.accessed = true;
+ put_page(page);
+ goto out;
+ }
+
+ need_lock = !PageAnon(page) || PageKsm(page);
+ if (need_lock && !trylock_page(page)) {
+ put_page(page);
+ return NULL;
+ }
+
+ rmap_walk(page, &rwc);
+
+ if (need_lock)
+ unlock_page(page);
+ put_page(page);
+
+out:
+ *page_sz = result.page_sz;
+ return result.accessed;
+}
+
+static void __damon_pa_check_access(struct damon_ctx *ctx,
+ struct damon_region *r)
+{
+ static unsigned long last_addr;
+ static unsigned long last_page_sz = PAGE_SIZE;
+ static bool last_accessed;
+
+ /* If the region is in the last checked page, reuse the result */
+ if (ALIGN_DOWN(last_addr, last_page_sz) ==
+ ALIGN_DOWN(r->sampling_addr, last_page_sz)) {
+ if (last_accessed)
+ r->nr_accesses++;
+ return;
+ }
+
+ last_accessed = damon_pa_young(r->sampling_addr, &last_page_sz);
+ if (last_accessed)
+ r->nr_accesses++;
+
+ last_addr = r->sampling_addr;
+}
+
+unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
+{
+ struct damon_target *t;
+ struct damon_region *r;
+ unsigned int max_nr_accesses = 0;
+
+ damon_for_each_target(t, ctx) {
+ damon_for_each_region(r, t) {
+ __damon_pa_check_access(ctx, r);
+ max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
+ }
+ }
+
+ return max_nr_accesses;
+}
+
+bool damon_pa_target_valid(void *t)
+{
+ return true;
+}
+
+int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *scheme)
+{
+ unsigned long addr;
+ LIST_HEAD(page_list);
+
+ if (scheme->action != DAMOS_PAGEOUT)
+ return -EINVAL;
+
+ for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ struct page *page = damon_get_page(PHYS_PFN(addr));
+
+ if (!page)
+ continue;
+
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (isolate_lru_page(page)) {
+ put_page(page);
+ continue;
+ }
+ if (PageUnevictable(page)) {
+ putback_lru_page(page);
+ } else {
+ list_add(&page->lru, &page_list);
+ put_page(page);
+ }
+ }
+ reclaim_pages(&page_list);
+ cond_resched();
+ return 0;
+}
+
+int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
+ struct damon_region *r, struct damos *scheme)
+{
+ switch (scheme->action) {
+ case DAMOS_PAGEOUT:
+ return damon_pageout_score(context, r, scheme);
+ default:
+ break;
+ }
+
+ return DAMOS_MAX_SCORE;
+}
+
+void damon_pa_set_primitives(struct damon_ctx *ctx)
+{
+ ctx->primitive.init = NULL;
+ ctx->primitive.update = NULL;
+ ctx->primitive.prepare_access_checks = damon_pa_prepare_access_checks;
+ ctx->primitive.check_accesses = damon_pa_check_accesses;
+ ctx->primitive.reset_aggregated = NULL;
+ ctx->primitive.target_valid = damon_pa_target_valid;
+ ctx->primitive.cleanup = NULL;
+ ctx->primitive.apply_scheme = damon_pa_apply_scheme;
+ ctx->primitive.get_scheme_score = damon_pa_scheme_score;
+}
diff --git a/mm/damon/prmtv-common.c b/mm/damon/prmtv-common.c
new file mode 100644
index 000000000000..92a04f5831d6
--- /dev/null
+++ b/mm/damon/prmtv-common.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Common Primitives for Data Access Monitoring
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+
+#include "prmtv-common.h"
+
+/*
+ * Get an online page for a pfn if it's in the LRU list. Otherwise, returns
+ * NULL.
+ *
+ * The body of this function is stolen from the 'page_idle_get_page()'. We
+ * steal rather than reuse it because the code is quite simple.
+ */
+struct page *damon_get_page(unsigned long pfn)
+{
+ struct page *page = pfn_to_online_page(pfn);
+
+ if (!page || !PageLRU(page) || !get_page_unless_zero(page))
+ return NULL;
+
+ if (unlikely(!PageLRU(page))) {
+ put_page(page);
+ page = NULL;
+ }
+ return page;
+}
+
+void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr)
+{
+ bool referenced = false;
+ struct page *page = damon_get_page(pte_pfn(*pte));
+
+ if (!page)
+ return;
+
+ if (pte_young(*pte)) {
+ referenced = true;
+ *pte = pte_mkold(*pte);
+ }
+
+#ifdef CONFIG_MMU_NOTIFIER
+ if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
+ referenced = true;
+#endif /* CONFIG_MMU_NOTIFIER */
+
+ if (referenced)
+ set_page_young(page);
+
+ set_page_idle(page);
+ put_page(page);
+}
+
+void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr)
+{
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ bool referenced = false;
+ struct page *page = damon_get_page(pmd_pfn(*pmd));
+
+ if (!page)
+ return;
+
+ if (pmd_young(*pmd)) {
+ referenced = true;
+ *pmd = pmd_mkold(*pmd);
+ }
+
+#ifdef CONFIG_MMU_NOTIFIER
+ if (mmu_notifier_clear_young(mm, addr,
+ addr + ((1UL) << HPAGE_PMD_SHIFT)))
+ referenced = true;
+#endif /* CONFIG_MMU_NOTIFIER */
+
+ if (referenced)
+ set_page_young(page);
+
+ set_page_idle(page);
+ put_page(page);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+}
+
+#define DAMON_MAX_SUBSCORE (100)
+#define DAMON_MAX_AGE_IN_LOG (32)
+
+int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s)
+{
+ unsigned int max_nr_accesses;
+ int freq_subscore;
+ unsigned int age_in_sec;
+ int age_in_log, age_subscore;
+ unsigned int freq_weight = s->quota.weight_nr_accesses;
+ unsigned int age_weight = s->quota.weight_age;
+ int hotness;
+
+ max_nr_accesses = c->aggr_interval / c->sample_interval;
+ freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+
+ age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
+ for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
+ age_in_log++, age_in_sec >>= 1)
+ ;
+
+ /* If frequency is 0, higher age means it's colder */
+ if (freq_subscore == 0)
+ age_in_log *= -1;
+
+ /*
+ * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
+ * Scale it to be in [0, 100] and set it as age subscore.
+ */
+ age_in_log += DAMON_MAX_AGE_IN_LOG;
+ age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
+ DAMON_MAX_AGE_IN_LOG / 2;
+
+ hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
+ if (freq_weight + age_weight)
+ hotness /= freq_weight + age_weight;
+ /*
+ * Transform it to fit in [0, DAMOS_MAX_SCORE]
+ */
+ hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
+
+ /* Return coldness of the region */
+ return DAMOS_MAX_SCORE - hotness;
+}
diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h
new file mode 100644
index 000000000000..61f27037603e
--- /dev/null
+++ b/mm/damon/prmtv-common.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Common Primitives for Data Access Monitoring
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#include <linux/damon.h>
+#include <linux/random.h>
+
+/* Get a random number in [l, r) */
+#define damon_rand(l, r) (l + prandom_u32_max(r - l))
+
+struct page *damon_get_page(unsigned long pfn);
+
+void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr);
+void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
+
+int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s);
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
new file mode 100644
index 000000000000..dc1485044eaf
--- /dev/null
+++ b/mm/damon/reclaim.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON-based page reclamation
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-reclaim: " fmt
+
+#include <linux/damon.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_reclaim."
+
+/*
+ * Enable or disable DAMON_RECLAIM.
+ *
+ * You can enable DAMON_RCLAIM by setting the value of this parameter as ``Y``.
+ * Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could
+ * do no real monitoring and reclamation due to the watermarks-based activation
+ * condition. Refer to below descriptions for the watermarks parameter for
+ * this.
+ */
+static bool enabled __read_mostly;
+module_param(enabled, bool, 0600);
+
+/*
+ * Time threshold for cold memory regions identification in microseconds.
+ *
+ * If a memory region is not accessed for this or longer time, DAMON_RECLAIM
+ * identifies the region as cold, and reclaims. 120 seconds by default.
+ */
+static unsigned long min_age __read_mostly = 120000000;
+module_param(min_age, ulong, 0600);
+
+/*
+ * Limit of time for trying the reclamation in milliseconds.
+ *
+ * DAMON_RECLAIM tries to use only up to this time within a time window
+ * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be
+ * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero,
+ * the limit is disabled.
+ *
+ * 10 ms by default.
+ */
+static unsigned long quota_ms __read_mostly = 10;
+module_param(quota_ms, ulong, 0600);
+
+/*
+ * Limit of size of memory for the reclamation in bytes.
+ *
+ * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a
+ * time window (quota_reset_interval_ms) and makes no more than this limit is
+ * tried. This can be used for limiting consumption of CPU and IO. If this
+ * value is zero, the limit is disabled.
+ *
+ * 128 MiB by default.
+ */
+static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024;
+module_param(quota_sz, ulong, 0600);
+
+/*
+ * The time/size quota charge reset interval in milliseconds.
+ *
+ * The charge reset interval for the quota of time (quota_ms) and size
+ * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than
+ * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms
+ * milliseconds.
+ *
+ * 1 second by default.
+ */
+static unsigned long quota_reset_interval_ms __read_mostly = 1000;
+module_param(quota_reset_interval_ms, ulong, 0600);
+
+/*
+ * The watermarks check time interval in microseconds.
+ *
+ * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is
+ * enabled but inactive due to its watermarks rule. 5 seconds by default.
+ */
+static unsigned long wmarks_interval __read_mostly = 5000000;
+module_param(wmarks_interval, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the high watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is higher than
+ * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically
+ * checks the watermarks. 500 (50%) by default.
+ */
+static unsigned long wmarks_high __read_mostly = 500;
+module_param(wmarks_high, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the middle watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is between this and
+ * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring
+ * and the reclaiming. 400 (40%) by default.
+ */
+static unsigned long wmarks_mid __read_mostly = 400;
+module_param(wmarks_mid, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the low watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is lower than this,
+ * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks
+ * the watermarks. In the case, the system falls back to the LRU-based page
+ * granularity reclamation logic. 200 (20%) by default.
+ */
+static unsigned long wmarks_low __read_mostly = 200;
+module_param(wmarks_low, ulong, 0600);
+
+/*
+ * Sampling interval for the monitoring in microseconds.
+ *
+ * The sampling interval of DAMON for the cold memory monitoring. Please refer
+ * to the DAMON documentation for more detail. 5 ms by default.
+ */
+static unsigned long sample_interval __read_mostly = 5000;
+module_param(sample_interval, ulong, 0600);
+
+/*
+ * Aggregation interval for the monitoring in microseconds.
+ *
+ * The aggregation interval of DAMON for the cold memory monitoring. Please
+ * refer to the DAMON documentation for more detail. 100 ms by default.
+ */
+static unsigned long aggr_interval __read_mostly = 100000;
+module_param(aggr_interval, ulong, 0600);
+
+/*
+ * Minimum number of monitoring regions.
+ *
+ * The minimal number of monitoring regions of DAMON for the cold memory
+ * monitoring. This can be used to set lower-bound of the monitoring quality.
+ * But, setting this too high could result in increased monitoring overhead.
+ * Please refer to the DAMON documentation for more detail. 10 by default.
+ */
+static unsigned long min_nr_regions __read_mostly = 10;
+module_param(min_nr_regions, ulong, 0600);
+
+/*
+ * Maximum number of monitoring regions.
+ *
+ * The maximum number of monitoring regions of DAMON for the cold memory
+ * monitoring. This can be used to set upper-bound of the monitoring overhead.
+ * However, setting this too low could result in bad monitoring quality.
+ * Please refer to the DAMON documentation for more detail. 1000 by default.
+ */
+static unsigned long max_nr_regions __read_mostly = 1000;
+module_param(max_nr_regions, ulong, 0600);
+
+/*
+ * Start of the target memory region in physical address.
+ *
+ * The start physical address of memory region that DAMON_RECLAIM will do work
+ * against. By default, biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_start __read_mostly;
+module_param(monitor_region_start, ulong, 0600);
+
+/*
+ * End of the target memory region in physical address.
+ *
+ * The end physical address of memory region that DAMON_RECLAIM will do work
+ * against. By default, biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_end __read_mostly;
+module_param(monitor_region_end, ulong, 0600);
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+static int kdamond_pid __read_mostly = -1;
+module_param(kdamond_pid, int, 0400);
+
+static struct damon_ctx *ctx;
+static struct damon_target *target;
+
+struct damon_reclaim_ram_walk_arg {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int walk_system_ram(struct resource *res, void *arg)
+{
+ struct damon_reclaim_ram_walk_arg *a = arg;
+
+ if (a->end - a->start < res->end - res->start) {
+ a->start = res->start;
+ a->end = res->end;
+ }
+ return 0;
+}
+
+/*
+ * Find biggest 'System RAM' resource and store its start and end address in
+ * @start and @end, respectively. If no System RAM is found, returns false.
+ */
+static bool get_monitoring_region(unsigned long *start, unsigned long *end)
+{
+ struct damon_reclaim_ram_walk_arg arg = {};
+
+ walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
+ if (arg.end <= arg.start)
+ return false;
+
+ *start = arg.start;
+ *end = arg.end;
+ return true;
+}
+
+static struct damos *damon_reclaim_new_scheme(void)
+{
+ struct damos_watermarks wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = wmarks_interval,
+ .high = wmarks_high,
+ .mid = wmarks_mid,
+ .low = wmarks_low,
+ };
+ struct damos_quota quota = {
+ /*
+ * Do not try reclamation for more than quota_ms milliseconds
+ * or quota_sz bytes within quota_reset_interval_ms.
+ */
+ .ms = quota_ms,
+ .sz = quota_sz,
+ .reset_interval = quota_reset_interval_ms,
+ /* Within the quota, page out older regions first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 0,
+ .weight_age = 1
+ };
+ struct damos *scheme = damon_new_scheme(
+ /* Find regions having PAGE_SIZE or larger size */
+ PAGE_SIZE, ULONG_MAX,
+ /* and not accessed at all */
+ 0, 0,
+ /* for min_age or more micro-seconds, and */
+ min_age / aggr_interval, UINT_MAX,
+ /* page out those, as soon as found */
+ DAMOS_PAGEOUT,
+ /* under the quota. */
+ &quota,
+ /* (De)activate this according to the watermarks. */
+ &wmarks);
+
+ return scheme;
+}
+
+static int damon_reclaim_turn(bool on)
+{
+ struct damon_region *region;
+ struct damos *scheme;
+ int err;
+
+ if (!on) {
+ err = damon_stop(&ctx, 1);
+ if (!err)
+ kdamond_pid = -1;
+ return err;
+ }
+
+ err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
+ min_nr_regions, max_nr_regions);
+ if (err)
+ return err;
+
+ if (monitor_region_start > monitor_region_end)
+ return -EINVAL;
+ if (!monitor_region_start && !monitor_region_end &&
+ !get_monitoring_region(&monitor_region_start,
+ &monitor_region_end))
+ return -EINVAL;
+ /* DAMON will free this on its own when finish monitoring */
+ region = damon_new_region(monitor_region_start, monitor_region_end);
+ if (!region)
+ return -ENOMEM;
+ damon_add_region(region, target);
+
+ /* Will be freed by 'damon_set_schemes()' below */
+ scheme = damon_reclaim_new_scheme();
+ if (!scheme) {
+ err = -ENOMEM;
+ goto free_region_out;
+ }
+ err = damon_set_schemes(ctx, &scheme, 1);
+ if (err)
+ goto free_scheme_out;
+
+ err = damon_start(&ctx, 1);
+ if (!err) {
+ kdamond_pid = ctx->kdamond->pid;
+ return 0;
+ }
+
+free_scheme_out:
+ damon_destroy_scheme(scheme);
+free_region_out:
+ damon_destroy_region(region, target);
+ return err;
+}
+
+#define ENABLE_CHECK_INTERVAL_MS 1000
+static struct delayed_work damon_reclaim_timer;
+static void damon_reclaim_timer_fn(struct work_struct *work)
+{
+ static bool last_enabled;
+ bool now_enabled;
+
+ now_enabled = enabled;
+ if (last_enabled != now_enabled) {
+ if (!damon_reclaim_turn(now_enabled))
+ last_enabled = now_enabled;
+ else
+ enabled = last_enabled;
+ }
+
+ schedule_delayed_work(&damon_reclaim_timer,
+ msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS));
+}
+static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
+
+static int __init damon_reclaim_init(void)
+{
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return -ENOMEM;
+
+ damon_pa_set_primitives(ctx);
+
+ /* 4242 means nothing but fun */
+ target = damon_new_target(4242);
+ if (!target) {
+ damon_destroy_ctx(ctx);
+ return -ENOMEM;
+ }
+ damon_add_target(ctx, target);
+
+ schedule_delayed_work(&damon_reclaim_timer, 0);
+ return 0;
+}
+
+module_init(damon_reclaim_init);
diff --git a/mm/damon/vaddr-test.h b/mm/damon/vaddr-test.h
index 1f5c13257dba..ecfd0b2ed222 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/vaddr-test.h
@@ -233,7 +233,7 @@ static void damon_test_apply_three_regions3(struct kunit *test)
* and 70-100) has totally freed and mapped to different area (30-32 and
* 65-68). The target regions which were in the old second and third big
* regions should now be removed and new target regions covering the new second
- * and third big regions should be crated.
+ * and third big regions should be created.
*/
static void damon_test_apply_three_regions4(struct kunit *test)
{
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 58c1fb2aafa9..35fe49080ee9 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -7,25 +7,20 @@
#define pr_fmt(fmt) "damon-va: " fmt
-#include <linux/damon.h>
+#include <asm-generic/mman-common.h>
+#include <linux/highmem.h>
#include <linux/hugetlb.h>
-#include <linux/mm.h>
#include <linux/mmu_notifier.h>
-#include <linux/highmem.h>
#include <linux/page_idle.h>
#include <linux/pagewalk.h>
-#include <linux/random.h>
-#include <linux/sched/mm.h>
-#include <linux/slab.h>
+
+#include "prmtv-common.h"
#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
#undef DAMON_MIN_REGION
#define DAMON_MIN_REGION 1
#endif
-/* Get a random number in [l, r) */
-#define damon_rand(l, r) (l + prandom_u32_max(r - l))
-
/*
* 't->id' should be the pointer to the relevant 'struct pid' having reference
* count. Caller must put the returned task, unless it is NULL.
@@ -311,7 +306,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
struct damon_addr_range bregions[3])
{
struct damon_region *r, *next;
- unsigned int i = 0;
+ unsigned int i;
/* Remove regions which are not in the three big regions now */
damon_for_each_region_safe(r, next, t) {
@@ -372,82 +367,6 @@ void damon_va_update(struct damon_ctx *ctx)
}
}
-/*
- * Get an online page for a pfn if it's in the LRU list. Otherwise, returns
- * NULL.
- *
- * The body of this function is stolen from the 'page_idle_get_page()'. We
- * steal rather than reuse it because the code is quite simple.
- */
-static struct page *damon_get_page(unsigned long pfn)
-{
- struct page *page = pfn_to_online_page(pfn);
-
- if (!page || !PageLRU(page) || !get_page_unless_zero(page))
- return NULL;
-
- if (unlikely(!PageLRU(page))) {
- put_page(page);
- page = NULL;
- }
- return page;
-}
-
-static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm,
- unsigned long addr)
-{
- bool referenced = false;
- struct page *page = damon_get_page(pte_pfn(*pte));
-
- if (!page)
- return;
-
- if (pte_young(*pte)) {
- referenced = true;
- *pte = pte_mkold(*pte);
- }
-
-#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
- referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
-
- if (referenced)
- set_page_young(page);
-
- set_page_idle(page);
- put_page(page);
-}
-
-static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
- unsigned long addr)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- bool referenced = false;
- struct page *page = damon_get_page(pmd_pfn(*pmd));
-
- if (!page)
- return;
-
- if (pmd_young(*pmd)) {
- referenced = true;
- *pmd = pmd_mkold(*pmd);
- }
-
-#ifdef CONFIG_MMU_NOTIFIER
- if (mmu_notifier_clear_young(mm, addr,
- addr + ((1UL) << HPAGE_PMD_SHIFT)))
- referenced = true;
-#endif /* CONFIG_MMU_NOTIFIER */
-
- if (referenced)
- set_page_young(page);
-
- set_page_idle(page);
- put_page(page);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-}
-
static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
@@ -475,7 +394,7 @@ out:
return 0;
}
-static struct mm_walk_ops damon_mkold_ops = {
+static const struct mm_walk_ops damon_mkold_ops = {
.pmd_entry = damon_mkold_pmd_entry,
};
@@ -571,7 +490,7 @@ out:
return 0;
}
-static struct mm_walk_ops damon_young_ops = {
+static const struct mm_walk_ops damon_young_ops = {
.pmd_entry = damon_young_pmd_entry,
};
@@ -658,6 +577,76 @@ bool damon_va_target_valid(void *target)
return false;
}
+#ifndef CONFIG_ADVISE_SYSCALLS
+static int damos_madvise(struct damon_target *target, struct damon_region *r,
+ int behavior)
+{
+ return -EINVAL;
+}
+#else
+static int damos_madvise(struct damon_target *target, struct damon_region *r,
+ int behavior)
+{
+ struct mm_struct *mm;
+ int ret = -ENOMEM;
+
+ mm = damon_get_mm(target);
+ if (!mm)
+ goto out;
+
+ ret = do_madvise(mm, PAGE_ALIGN(r->ar.start),
+ PAGE_ALIGN(r->ar.end - r->ar.start), behavior);
+ mmput(mm);
+out:
+ return ret;
+}
+#endif /* CONFIG_ADVISE_SYSCALLS */
+
+int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
+ struct damon_region *r, struct damos *scheme)
+{
+ int madv_action;
+
+ switch (scheme->action) {
+ case DAMOS_WILLNEED:
+ madv_action = MADV_WILLNEED;
+ break;
+ case DAMOS_COLD:
+ madv_action = MADV_COLD;
+ break;
+ case DAMOS_PAGEOUT:
+ madv_action = MADV_PAGEOUT;
+ break;
+ case DAMOS_HUGEPAGE:
+ madv_action = MADV_HUGEPAGE;
+ break;
+ case DAMOS_NOHUGEPAGE:
+ madv_action = MADV_NOHUGEPAGE;
+ break;
+ case DAMOS_STAT:
+ return 0;
+ default:
+ pr_warn("Wrong action %d\n", scheme->action);
+ return -EINVAL;
+ }
+
+ return damos_madvise(t, r, madv_action);
+}
+
+int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
+ struct damon_region *r, struct damos *scheme)
+{
+
+ switch (scheme->action) {
+ case DAMOS_PAGEOUT:
+ return damon_pageout_score(context, r, scheme);
+ default:
+ break;
+ }
+
+ return DAMOS_MAX_SCORE;
+}
+
void damon_va_set_primitives(struct damon_ctx *ctx)
{
ctx->primitive.init = damon_va_init;
@@ -667,6 +656,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx)
ctx->primitive.reset_aggregated = NULL;
ctx->primitive.target_valid = damon_va_target_valid;
ctx->primitive.cleanup = NULL;
+ ctx->primitive.apply_scheme = damon_va_apply_scheme;
+ ctx->primitive.get_scheme_score = damon_va_scheme_score;
}
#include "vaddr-test.h"
diff --git a/mm/debug.c b/mm/debug.c
index fae0f81ad831..a05a39ff8fe4 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -16,17 +16,19 @@
#include <linux/ctype.h>
#include "internal.h"
+#include <trace/events/migrate.h>
+
+/*
+ * Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can
+ * be used to populate migrate_reason_names[].
+ */
+#undef EM
+#undef EMe
+#define EM(a, b) b,
+#define EMe(a, b) b
const char *migrate_reason_names[MR_TYPES] = {
- "compaction",
- "memory_failure",
- "memory_hotplug",
- "syscall_or_cpuset",
- "mempolicy_mbind",
- "numa_misplaced",
- "contig_range",
- "longterm_pin",
- "demotion",
+ MIGRATE_REASON
};
const struct trace_print_flags pageflag_names[] = {
@@ -162,7 +164,7 @@ static void __dump_page(struct page *page)
out_mapping:
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
+ pr_warn("%sflags: %pGp%s\n", type, &head->flags,
page_cma ? " CMA" : "");
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
@@ -216,7 +218,7 @@ void dump_mm(const struct mm_struct *mm)
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
- "binfmt %px flags %lx core_state %px\n"
+ "binfmt %px flags %lx\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
@@ -248,7 +250,7 @@ void dump_mm(const struct mm_struct *mm)
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
- mm->binfmt, mm->flags, mm->core_state,
+ mm->binfmt, mm->flags,
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1403639302e4..228e3954b90c 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1104,13 +1104,14 @@ static int __init init_args(struct pgtable_debug_args *args)
/*
* Initialize the debugging data.
*
- * __P000 (or even __S000) will help create page table entries with
- * PROT_NONE permission as required for pxx_protnone_tests().
+ * protection_map[0] (or even protection_map[8]) will help create
+ * page table entries with PROT_NONE permission as required for
+ * pxx_protnone_tests().
*/
memset(args, 0, sizeof(*args));
args->vaddr = get_random_vaddr();
args->page_prot = vm_get_page_prot(VMFLAGS);
- args->page_prot_none = __P000;
+ args->page_prot_none = protection_map[0];
args->is_contiguous_page = false;
args->pud_pfn = ULONG_MAX;
args->pmd_pfn = ULONG_MAX;
diff --git a/mm/filemap.c b/mm/filemap.c
index dae481293b5d..daa0e23a6ee6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,6 @@
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
-#include <linux/blkdev.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
@@ -90,7 +89,7 @@
* ->lock_page (filemap_fault, access_process_vm)
*
* ->i_rwsem (generic_perform_write)
- * ->mmap_lock (fault_in_pages_readable->do_page_fault)
+ * ->mmap_lock (fault_in_readable->do_page_fault)
*
* bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
@@ -262,9 +261,13 @@ void delete_from_page_cache(struct page *page)
struct address_space *mapping = page_mapping(page);
BUG_ON(!PageLocked(page));
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
__delete_from_page_cache(page, NULL);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
page_cache_free_page(mapping, page);
}
@@ -340,6 +343,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
if (!pagevec_count(pvec))
return;
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
for (i = 0; i < pagevec_count(pvec); i++) {
trace_mm_filemap_delete_from_page_cache(pvec->pages[i]);
@@ -348,6 +352,9 @@ void delete_from_page_cache_batch(struct address_space *mapping,
}
page_cache_delete_batch(mapping, pvec);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
for (i = 0; i < pagevec_count(pvec); i++)
page_cache_free_page(mapping, pvec->pages[i]);
@@ -639,6 +646,30 @@ static bool mapping_needs_writeback(struct address_space *mapping)
return mapping->nrpages;
}
+static bool filemap_range_has_writeback(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT;
+ struct page *page;
+
+ if (end_byte < start_byte)
+ return false;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, max) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
+ continue;
+ if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
+ break;
+ }
+ rcu_read_unlock();
+ return page != NULL;
+
+}
+
/**
* filemap_range_needs_writeback - check if range potentially needs writeback
* @mapping: address space within which to check
@@ -656,29 +687,12 @@ static bool mapping_needs_writeback(struct address_space *mapping)
bool filemap_range_needs_writeback(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
- XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
- pgoff_t max = end_byte >> PAGE_SHIFT;
- struct page *page;
-
if (!mapping_needs_writeback(mapping))
return false;
if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
return false;
- if (end_byte < start_byte)
- return false;
-
- rcu_read_lock();
- xas_for_each(&xas, page, max) {
- if (xas_retry(&xas, page))
- continue;
- if (xa_is_value(page))
- continue;
- if (PageDirty(page) || PageLocked(page) || PageWriteback(page))
- break;
- }
- rcu_read_unlock();
- return page != NULL;
+ return filemap_range_has_writeback(mapping, start_byte, end_byte);
}
EXPORT_SYMBOL_GPL(filemap_range_needs_writeback);
@@ -835,6 +849,8 @@ EXPORT_SYMBOL(file_write_and_wait_range);
*/
void replace_page_cache_page(struct page *old, struct page *new)
{
+ struct folio *fold = page_folio(old);
+ struct folio *fnew = page_folio(new);
struct address_space *mapping = old->mapping;
void (*freepage)(struct page *) = mapping->a_ops->freepage;
pgoff_t offset = old->index;
@@ -848,7 +864,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
new->mapping = mapping;
new->index = offset;
- mem_cgroup_migrate(old, new);
+ mem_cgroup_migrate(fold, fnew);
xas_lock_irq(&xas);
xas_store(&xas, new);
@@ -870,26 +886,25 @@ void replace_page_cache_page(struct page *old, struct page *new)
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-noinline int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp,
- void **shadowp)
+noinline int __filemap_add_folio(struct address_space *mapping,
+ struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
- XA_STATE(xas, &mapping->i_pages, offset);
- int huge = PageHuge(page);
+ XA_STATE(xas, &mapping->i_pages, index);
+ int huge = folio_test_hugetlb(folio);
int error;
bool charged = false;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
mapping_set_update(&xas, mapping);
- get_page(page);
- page->mapping = mapping;
- page->index = offset;
+ folio_get(folio);
+ folio->mapping = mapping;
+ folio->index = index;
if (!huge) {
- error = mem_cgroup_charge(page, NULL, gfp);
+ error = mem_cgroup_charge(folio, NULL, gfp);
+ VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
if (error)
goto error;
charged = true;
@@ -901,7 +916,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
unsigned int order = xa_get_order(xas.xa, xas.xa_index);
void *entry, *old = NULL;
- if (order > thp_order(page))
+ if (order > folio_order(folio))
xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
order, gfp);
xas_lock_irq(&xas);
@@ -918,13 +933,13 @@ noinline int __add_to_page_cache_locked(struct page *page,
*shadowp = old;
/* entry may have been split before we acquired lock */
order = xa_get_order(xas.xa, xas.xa_index);
- if (order > thp_order(page)) {
+ if (order > folio_order(folio)) {
xas_split(&xas, old, order);
xas_reset(&xas);
}
}
- xas_store(&xas, page);
+ xas_store(&xas, folio);
if (xas_error(&xas))
goto unlock;
@@ -932,7 +947,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting */
if (!huge)
- __inc_lruvec_page_state(page, NR_FILE_PAGES);
+ __lruvec_stat_add_folio(folio, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -940,19 +955,19 @@ unlock:
if (xas_error(&xas)) {
error = xas_error(&xas);
if (charged)
- mem_cgroup_uncharge(page);
+ mem_cgroup_uncharge(folio);
goto error;
}
- trace_mm_filemap_add_to_page_cache(page);
+ trace_mm_filemap_add_to_page_cache(&folio->page);
return 0;
error:
- page->mapping = NULL;
+ folio->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- put_page(page);
+ folio_put(folio);
return error;
}
-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
+ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
@@ -969,59 +984,58 @@ ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
- return __add_to_page_cache_locked(page, mapping, offset,
+ return __filemap_add_folio(mapping, page_folio(page), offset,
gfp_mask, NULL);
}
EXPORT_SYMBOL(add_to_page_cache_locked);
-int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask)
+int filemap_add_folio(struct address_space *mapping, struct folio *folio,
+ pgoff_t index, gfp_t gfp)
{
void *shadow = NULL;
int ret;
- __SetPageLocked(page);
- ret = __add_to_page_cache_locked(page, mapping, offset,
- gfp_mask, &shadow);
+ __folio_set_locked(folio);
+ ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
if (unlikely(ret))
- __ClearPageLocked(page);
+ __folio_clear_locked(folio);
else {
/*
- * The page might have been evicted from cache only
+ * The folio might have been evicted from cache only
* recently, in which case it should be activated like
- * any other repeatedly accessed page.
- * The exception is pages getting rewritten; evicting other
+ * any other repeatedly accessed folio.
+ * The exception is folios getting rewritten; evicting other
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
- WARN_ON_ONCE(PageActive(page));
- if (!(gfp_mask & __GFP_WRITE) && shadow)
- workingset_refault(page, shadow);
- lru_cache_add(page);
+ WARN_ON_ONCE(folio_test_active(folio));
+ if (!(gfp & __GFP_WRITE) && shadow)
+ workingset_refault(folio, shadow);
+ folio_add_lru(folio);
}
return ret;
}
-EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
+EXPORT_SYMBOL_GPL(filemap_add_folio);
#ifdef CONFIG_NUMA
-struct page *__page_cache_alloc(gfp_t gfp)
+struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
{
int n;
- struct page *page;
+ struct folio *folio;
if (cpuset_do_page_mem_spread()) {
unsigned int cpuset_mems_cookie;
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
- page = __alloc_pages_node(n, gfp, 0);
- } while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
+ folio = __folio_alloc_node(gfp, order, n);
+ } while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));
- return page;
+ return folio;
}
- return alloc_pages(gfp, 0);
+ return folio_alloc(gfp, order);
}
-EXPORT_SYMBOL(__page_cache_alloc);
+EXPORT_SYMBOL(filemap_alloc_folio);
#endif
/*
@@ -1074,11 +1088,11 @@ EXPORT_SYMBOL(filemap_invalidate_unlock_two);
*/
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
-static wait_queue_head_t page_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
+static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;
-static wait_queue_head_t *page_waitqueue(struct page *page)
+static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
- return &page_wait_table[hash_ptr(page, PAGE_WAIT_TABLE_BITS)];
+ return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}
void __init pagecache_init(void)
@@ -1086,7 +1100,7 @@ void __init pagecache_init(void)
int i;
for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
- init_waitqueue_head(&page_wait_table[i]);
+ init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
}
@@ -1141,10 +1155,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*/
flags = wait->flags;
if (flags & WQ_FLAG_EXCLUSIVE) {
- if (test_bit(key->bit_nr, &key->page->flags))
+ if (test_bit(key->bit_nr, &key->folio->flags))
return -1;
if (flags & WQ_FLAG_CUSTOM) {
- if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ if (test_and_set_bit(key->bit_nr, &key->folio->flags))
return -1;
flags |= WQ_FLAG_DONE;
}
@@ -1157,7 +1171,7 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
*
* So update the flags atomically, and wake up the waiter
* afterwards to avoid any races. This store-release pairs
- * with the load-acquire in wait_on_page_bit_common().
+ * with the load-acquire in folio_wait_bit_common().
*/
smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
wake_up_state(wait->private, mode);
@@ -1176,14 +1190,14 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
-static void wake_up_page_bit(struct page *page, int bit_nr)
+static void folio_wake_bit(struct folio *folio, int bit_nr)
{
- wait_queue_head_t *q = page_waitqueue(page);
+ wait_queue_head_t *q = folio_waitqueue(folio);
struct wait_page_key key;
unsigned long flags;
wait_queue_entry_t bookmark;
- key.page = page;
+ key.folio = folio;
key.bit_nr = bit_nr;
key.page_match = 0;
@@ -1218,7 +1232,7 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
* page waiters.
*/
if (!waitqueue_active(q) || !key.page_match) {
- ClearPageWaiters(page);
+ folio_clear_waiters(folio);
/*
* It's possible to miss clearing Waiters here, when we woke
* our page waiters, but the hashed waitqueue has waiters for
@@ -1230,19 +1244,19 @@ static void wake_up_page_bit(struct page *page, int bit_nr)
spin_unlock_irqrestore(&q->lock, flags);
}
-static void wake_up_page(struct page *page, int bit)
+static void folio_wake(struct folio *folio, int bit)
{
- if (!PageWaiters(page))
+ if (!folio_test_waiters(folio))
return;
- wake_up_page_bit(page, bit);
+ folio_wake_bit(folio, bit);
}
/*
- * A choice of three behaviors for wait_on_page_bit_common():
+ * A choice of three behaviors for folio_wait_bit_common():
*/
enum behavior {
EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
- * __lock_page() waiting on then setting PG_locked.
+ * __folio_lock() waiting on then setting PG_locked.
*/
SHARED, /* Hold ref to page and check the bit when woken, like
* wait_on_page_writeback() waiting on PG_writeback.
@@ -1253,16 +1267,16 @@ enum behavior {
};
/*
- * Attempt to check (or get) the page bit, and mark us done
+ * Attempt to check (or get) the folio flag, and mark us done
* if successful.
*/
-static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
struct wait_queue_entry *wait)
{
if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- if (test_and_set_bit(bit_nr, &page->flags))
+ if (test_and_set_bit(bit_nr, &folio->flags))
return false;
- } else if (test_bit(bit_nr, &page->flags))
+ } else if (test_bit(bit_nr, &folio->flags))
return false;
wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
@@ -1272,9 +1286,10 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;
-static inline int wait_on_page_bit_common(wait_queue_head_t *q,
- struct page *page, int bit_nr, int state, enum behavior behavior)
+static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
+ int state, enum behavior behavior)
{
+ wait_queue_head_t *q = folio_waitqueue(folio);
int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
@@ -1283,8 +1298,8 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
unsigned long pflags;
if (bit_nr == PG_locked &&
- !PageUptodate(page) && PageWorkingset(page)) {
- if (!PageSwapBacked(page)) {
+ !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
+ if (!folio_test_swapbacked(folio)) {
delayacct_thrashing_start();
delayacct = true;
}
@@ -1294,7 +1309,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
init_wait(wait);
wait->func = wake_page_function;
- wait_page.page = page;
+ wait_page.folio = folio;
wait_page.bit_nr = bit_nr;
repeat:
@@ -1309,7 +1324,7 @@ repeat:
* Do one last check whether we can get the
* page bit synchronously.
*
- * Do the SetPageWaiters() marking before that
+ * Do the folio_set_waiters() marking before that
* to let any waker we _just_ missed know they
* need to wake us up (otherwise they'll never
* even go to the slow case that looks at the
@@ -1320,8 +1335,8 @@ repeat:
* lock to avoid races.
*/
spin_lock_irq(&q->lock);
- SetPageWaiters(page);
- if (!trylock_page_bit_common(page, bit_nr, wait))
+ folio_set_waiters(folio);
+ if (!folio_trylock_flag(folio, bit_nr, wait))
__add_wait_queue_entry_tail(q, wait);
spin_unlock_irq(&q->lock);
@@ -1331,10 +1346,10 @@ repeat:
* see whether the page bit testing has already
* been done by the wake function.
*
- * We can drop our reference to the page.
+ * We can drop our reference to the folio.
*/
if (behavior == DROP)
- put_page(page);
+ folio_put(folio);
/*
* Note that until the "finish_wait()", or until
@@ -1371,7 +1386,7 @@ repeat:
*
* And if that fails, we'll have to retry this all.
*/
- if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
goto repeat;
wait->flags |= WQ_FLAG_DONE;
@@ -1380,7 +1395,7 @@ repeat:
/*
* If a signal happened, this 'finish_wait()' may remove the last
- * waiter from the wait-queues, but the PageWaiters bit will remain
+ * waiter from the wait-queues, but the folio waiters bit will remain
* set. That's ok. The next wakeup will take care of it, and trying
* to do it here would be difficult and prone to races.
*/
@@ -1411,19 +1426,17 @@ repeat:
return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
-void wait_on_page_bit(struct page *page, int bit_nr)
+void folio_wait_bit(struct folio *folio, int bit_nr)
{
- wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
+ folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
-EXPORT_SYMBOL(wait_on_page_bit);
+EXPORT_SYMBOL(folio_wait_bit);
-int wait_on_page_bit_killable(struct page *page, int bit_nr)
+int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
- wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
+ return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
-EXPORT_SYMBOL(wait_on_page_bit_killable);
+EXPORT_SYMBOL(folio_wait_bit_killable);
/**
* put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
@@ -1440,31 +1453,28 @@ EXPORT_SYMBOL(wait_on_page_bit_killable);
*/
int put_and_wait_on_page_locked(struct page *page, int state)
{
- wait_queue_head_t *q;
-
- page = compound_head(page);
- q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
+ return folio_wait_bit_common(page_folio(page), PG_locked, state,
+ DROP);
}
/**
- * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
- * @page: Page defining the wait queue of interest
+ * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
+ * @folio: Folio defining the wait queue of interest
* @waiter: Waiter to add to the queue
*
- * Add an arbitrary @waiter to the wait queue for the nominated @page.
+ * Add an arbitrary @waiter to the wait queue for the nominated @folio.
*/
-void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter)
+void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
- wait_queue_head_t *q = page_waitqueue(page);
+ wait_queue_head_t *q = folio_waitqueue(folio);
unsigned long flags;
spin_lock_irqsave(&q->lock, flags);
__add_wait_queue_entry_tail(q, waiter);
- SetPageWaiters(page);
+ folio_set_waiters(folio);
spin_unlock_irqrestore(&q->lock, flags);
}
-EXPORT_SYMBOL_GPL(add_page_wait_queue);
+EXPORT_SYMBOL_GPL(folio_add_wait_queue);
#ifndef clear_bit_unlock_is_negative_byte
@@ -1490,124 +1500,117 @@ static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem
#endif
/**
- * unlock_page - unlock a locked page
- * @page: the page
+ * folio_unlock - Unlock a locked folio.
+ * @folio: The folio.
*
- * Unlocks the page and wakes up sleepers in wait_on_page_locked().
- * Also wakes sleepers in wait_on_page_writeback() because the wakeup
- * mechanism between PageLocked pages and PageWriteback pages is shared.
- * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
+ * Unlocks the folio and wakes up any thread sleeping on the page lock.
*
- * Note that this depends on PG_waiters being the sign bit in the byte
- * that contains PG_locked - thus the BUILD_BUG_ON(). That allows us to
- * clear the PG_locked bit and test PG_waiters at the same time fairly
- * portably (architectures that do LL/SC can test any bit, while x86 can
- * test the sign bit).
+ * Context: May be called from interrupt or process context. May not be
+ * called from NMI context.
*/
-void unlock_page(struct page *page)
+void folio_unlock(struct folio *folio)
{
+ /* Bit 7 allows x86 to check the byte's sign bit */
BUILD_BUG_ON(PG_waiters != 7);
- page = compound_head(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (clear_bit_unlock_is_negative_byte(PG_locked, &page->flags))
- wake_up_page_bit(page, PG_locked);
+ BUILD_BUG_ON(PG_locked > 7);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ if (clear_bit_unlock_is_negative_byte(PG_locked, folio_flags(folio, 0)))
+ folio_wake_bit(folio, PG_locked);
}
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(folio_unlock);
/**
- * end_page_private_2 - Clear PG_private_2 and release any waiters
- * @page: The page
+ * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
+ * @folio: The folio.
*
- * Clear the PG_private_2 bit on a page and wake up any sleepers waiting for
- * this. The page ref held for PG_private_2 being set is released.
+ * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
+ * it. The folio reference held for PG_private_2 being set is released.
*
- * This is, for example, used when a netfs page is being written to a local
- * disk cache, thereby allowing writes to the cache for the same page to be
+ * This is, for example, used when a netfs folio is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same folio to be
* serialised.
*/
-void end_page_private_2(struct page *page)
+void folio_end_private_2(struct folio *folio)
{
- page = compound_head(page);
- VM_BUG_ON_PAGE(!PagePrivate2(page), page);
- clear_bit_unlock(PG_private_2, &page->flags);
- wake_up_page_bit(page, PG_private_2);
- put_page(page);
+ VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
+ clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
+ folio_wake_bit(folio, PG_private_2);
+ folio_put(folio);
}
-EXPORT_SYMBOL(end_page_private_2);
+EXPORT_SYMBOL(folio_end_private_2);
/**
- * wait_on_page_private_2 - Wait for PG_private_2 to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
*
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page.
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
*/
-void wait_on_page_private_2(struct page *page)
+void folio_wait_private_2(struct folio *folio)
{
- page = compound_head(page);
- while (PagePrivate2(page))
- wait_on_page_bit(page, PG_private_2);
+ while (folio_test_private_2(folio))
+ folio_wait_bit(folio, PG_private_2);
}
-EXPORT_SYMBOL(wait_on_page_private_2);
+EXPORT_SYMBOL(folio_wait_private_2);
/**
- * wait_on_page_private_2_killable - Wait for PG_private_2 to be cleared on a page
- * @page: The page to wait on
+ * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
+ * @folio: The folio to wait on.
*
- * Wait for PG_private_2 (aka PG_fscache) to be cleared on a page or until a
+ * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
* fatal signal is received by the calling task.
*
* Return:
* - 0 if successful.
* - -EINTR if a fatal signal was encountered.
*/
-int wait_on_page_private_2_killable(struct page *page)
+int folio_wait_private_2_killable(struct folio *folio)
{
int ret = 0;
- page = compound_head(page);
- while (PagePrivate2(page)) {
- ret = wait_on_page_bit_killable(page, PG_private_2);
+ while (folio_test_private_2(folio)) {
+ ret = folio_wait_bit_killable(folio, PG_private_2);
if (ret < 0)
break;
}
return ret;
}
-EXPORT_SYMBOL(wait_on_page_private_2_killable);
+EXPORT_SYMBOL(folio_wait_private_2_killable);
/**
- * end_page_writeback - end writeback against a page
- * @page: the page
+ * folio_end_writeback - End writeback against a folio.
+ * @folio: The folio.
*/
-void end_page_writeback(struct page *page)
+void folio_end_writeback(struct folio *folio)
{
/*
- * TestClearPageReclaim could be used here but it is an atomic
- * operation and overkill in this particular case. Failing to
- * shuffle a page marked for immediate reclaim is too mild to
- * justify taking an atomic operation penalty at the end of
- * ever page writeback.
+ * folio_test_clear_reclaim() could be used here but it is an
+ * atomic operation and overkill in this particular case. Failing
+ * to shuffle a folio marked for immediate reclaim is too mild
+ * a gain to justify taking an atomic operation penalty at the
+ * end of every folio writeback.
*/
- if (PageReclaim(page)) {
- ClearPageReclaim(page);
- rotate_reclaimable_page(page);
+ if (folio_test_reclaim(folio)) {
+ folio_clear_reclaim(folio);
+ folio_rotate_reclaimable(folio);
}
/*
- * Writeback does not hold a page reference of its own, relying
+ * Writeback does not hold a folio reference of its own, relying
* on truncation to wait for the clearing of PG_writeback.
- * But here we must make sure that the page is not freed and
- * reused before the wake_up_page().
+ * But here we must make sure that the folio is not freed and
+ * reused before the folio_wake().
*/
- get_page(page);
- if (!test_clear_page_writeback(page))
+ folio_get(folio);
+ if (!__folio_end_writeback(folio))
BUG();
smp_mb__after_atomic();
- wake_up_page(page, PG_writeback);
- put_page(page);
+ folio_wake(folio, PG_writeback);
+ acct_reclaim_writeback(folio);
+ folio_put(folio);
}
-EXPORT_SYMBOL(end_page_writeback);
+EXPORT_SYMBOL(folio_end_writeback);
/*
* After completing I/O on a page, call this routine to update the page
@@ -1638,39 +1641,35 @@ void page_endio(struct page *page, bool is_write, int err)
EXPORT_SYMBOL_GPL(page_endio);
/**
- * __lock_page - get a lock on the page, assuming we need to sleep to get it
- * @__page: the page to lock
+ * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
+ * @folio: The folio to lock
*/
-void __lock_page(struct page *__page)
+void __folio_lock(struct folio *folio)
{
- struct page *page = compound_head(__page);
- wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+ folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
EXCLUSIVE);
}
-EXPORT_SYMBOL(__lock_page);
+EXPORT_SYMBOL(__folio_lock);
-int __lock_page_killable(struct page *__page)
+int __folio_lock_killable(struct folio *folio)
{
- struct page *page = compound_head(__page);
- wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+ return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
EXCLUSIVE);
}
-EXPORT_SYMBOL_GPL(__lock_page_killable);
+EXPORT_SYMBOL_GPL(__folio_lock_killable);
-int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
- struct wait_queue_head *q = page_waitqueue(page);
+ struct wait_queue_head *q = folio_waitqueue(folio);
int ret = 0;
- wait->page = page;
+ wait->folio = folio;
wait->bit_nr = PG_locked;
spin_lock_irq(&q->lock);
__add_wait_queue_entry_tail(q, &wait->wait);
- SetPageWaiters(page);
- ret = !trylock_page(page);
+ folio_set_waiters(folio);
+ ret = !folio_trylock(folio);
/*
* If we were successful now, we know we're still on the
* waitqueue as we're still under the lock. This means it's
@@ -1687,16 +1686,16 @@ int __lock_page_async(struct page *page, struct wait_page_queue *wait)
/*
* Return values:
- * 1 - page is locked; mmap_lock is still held.
- * 0 - page is not locked.
+ * true - folio is locked; mmap_lock is still held.
+ * false - folio is not locked.
* mmap_lock has been released (mmap_read_unlock(), unless flags had both
* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
* which case mmap_lock is still held.
*
- * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
- * with the page locked and the mmap_lock unperturbed.
+ * If neither ALLOW_RETRY nor KILLABLE are set, will always return true
+ * with the folio locked and the mmap_lock unperturbed.
*/
-int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
+bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm,
unsigned int flags)
{
if (fault_flag_allow_retry_first(flags)) {
@@ -1705,28 +1704,28 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
* even though return 0.
*/
if (flags & FAULT_FLAG_RETRY_NOWAIT)
- return 0;
+ return false;
mmap_read_unlock(mm);
if (flags & FAULT_FLAG_KILLABLE)
- wait_on_page_locked_killable(page);
+ folio_wait_locked_killable(folio);
else
- wait_on_page_locked(page);
- return 0;
+ folio_wait_locked(folio);
+ return false;
}
if (flags & FAULT_FLAG_KILLABLE) {
- int ret;
+ bool ret;
- ret = __lock_page_killable(page);
+ ret = __folio_lock_killable(folio);
if (ret) {
mmap_read_unlock(mm);
- return 0;
+ return false;
}
} else {
- __lock_page(page);
+ __folio_lock(folio);
}
- return 1;
+ return true;
}
/**
@@ -1802,143 +1801,155 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
EXPORT_SYMBOL(page_cache_prev_miss);
/*
+ * Lockless page cache protocol:
+ * On the lookup side:
+ * 1. Load the folio from i_pages
+ * 2. Increment the refcount if it's not zero
+ * 3. If the folio is not found by xas_reload(), put the refcount and retry
+ *
+ * On the removal side:
+ * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
+ * B. Remove the page from i_pages
+ * C. Return the page to the page allocator
+ *
+ * This means that any page may have its reference count temporarily
+ * increased by a speculative page cache (or fast GUP) lookup as it can
+ * be allocated by another user before the RCU grace period expires.
+ * Because the refcount temporarily acquired here may end up being the
+ * last refcount on the page, any page allocation must be freeable by
+ * folio_put().
+ */
+
+/*
* mapping_get_entry - Get a page cache entry.
* @mapping: the address_space to search
* @index: The page cache index.
*
- * Looks up the page cache slot at @mapping & @index. If there is a
- * page cache page, the head page is returned with an increased refcount.
+ * Looks up the page cache entry at @mapping & @index. If it is a folio,
+ * it is returned with an increased refcount. If it is a shadow entry
+ * of a previously evicted folio, or a swap entry from shmem/tmpfs,
+ * it is returned without further action.
*
- * If the slot holds a shadow entry of a previously evicted page, or a
- * swap entry from shmem/tmpfs, it is returned.
- *
- * Return: The head page or shadow entry, %NULL if nothing is found.
+ * Return: The folio, swap or shadow entry, %NULL if nothing is found.
*/
-static struct page *mapping_get_entry(struct address_space *mapping,
- pgoff_t index)
+static void *mapping_get_entry(struct address_space *mapping, pgoff_t index)
{
XA_STATE(xas, &mapping->i_pages, index);
- struct page *page;
+ struct folio *folio;
rcu_read_lock();
repeat:
xas_reset(&xas);
- page = xas_load(&xas);
- if (xas_retry(&xas, page))
+ folio = xas_load(&xas);
+ if (xas_retry(&xas, folio))
goto repeat;
/*
* A shadow entry of a recently evicted page, or a swap entry from
* shmem/tmpfs. Return it without attempting to raise page count.
*/
- if (!page || xa_is_value(page))
+ if (!folio || xa_is_value(folio))
goto out;
- if (!page_cache_get_speculative(page))
+ if (!folio_try_get_rcu(folio))
goto repeat;
- /*
- * Has the page moved or been split?
- * This is part of the lockless pagecache protocol. See
- * include/linux/pagemap.h for details.
- */
- if (unlikely(page != xas_reload(&xas))) {
- put_page(page);
+ if (unlikely(folio != xas_reload(&xas))) {
+ folio_put(folio);
goto repeat;
}
out:
rcu_read_unlock();
- return page;
+ return folio;
}
/**
- * pagecache_get_page - Find and get a reference to a page.
+ * __filemap_get_folio - Find and get a reference to a folio.
* @mapping: The address_space to search.
* @index: The page index.
- * @fgp_flags: %FGP flags modify how the page is returned.
- * @gfp_mask: Memory allocation flags to use if %FGP_CREAT is specified.
+ * @fgp_flags: %FGP flags modify how the folio is returned.
+ * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
*
* Looks up the page cache entry at @mapping & @index.
*
* @fgp_flags can be zero or more of these flags:
*
- * * %FGP_ACCESSED - The page will be marked accessed.
- * * %FGP_LOCK - The page is returned locked.
- * * %FGP_HEAD - If the page is present and a THP, return the head page
- * rather than the exact page specified by the index.
+ * * %FGP_ACCESSED - The folio will be marked accessed.
+ * * %FGP_LOCK - The folio is returned locked.
* * %FGP_ENTRY - If there is a shadow / swap / DAX entry, return it
- * instead of allocating a new page to replace it.
+ * instead of allocating a new folio to replace it.
* * %FGP_CREAT - If no page is present then a new page is allocated using
- * @gfp_mask and added to the page cache and the VM's LRU list.
+ * @gfp and added to the page cache and the VM's LRU list.
* The page is returned locked and with an increased refcount.
* * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
* page is already in cache. If the page was allocated, unlock it before
* returning so the caller can do the same dance.
- * * %FGP_WRITE - The page will be written
- * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
- * * %FGP_NOWAIT - Don't get blocked by page lock
+ * * %FGP_WRITE - The page will be written to by the caller.
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp.
+ * * %FGP_NOWAIT - Don't get blocked by page lock.
+ * * %FGP_STABLE - Wait for the folio to be stable (finished writeback)
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
*
* If there is a page cache page, it is returned with an increased refcount.
*
- * Return: The found page or %NULL otherwise.
+ * Return: The found folio or %NULL otherwise.
*/
-struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp_mask)
+struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
+ int fgp_flags, gfp_t gfp)
{
- struct page *page;
+ struct folio *folio;
repeat:
- page = mapping_get_entry(mapping, index);
- if (xa_is_value(page)) {
+ folio = mapping_get_entry(mapping, index);
+ if (xa_is_value(folio)) {
if (fgp_flags & FGP_ENTRY)
- return page;
- page = NULL;
+ return folio;
+ folio = NULL;
}
- if (!page)
+ if (!folio)
goto no_page;
if (fgp_flags & FGP_LOCK) {
if (fgp_flags & FGP_NOWAIT) {
- if (!trylock_page(page)) {
- put_page(page);
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
return NULL;
}
} else {
- lock_page(page);
+ folio_lock(folio);
}
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
- unlock_page(page);
- put_page(page);
+ if (unlikely(folio->mapping != mapping)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto repeat;
}
- VM_BUG_ON_PAGE(!thp_contains(page, index), page);
+ VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
}
if (fgp_flags & FGP_ACCESSED)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
else if (fgp_flags & FGP_WRITE) {
/* Clear idle flag for buffer write */
- if (page_is_idle(page))
- clear_page_idle(page);
+ if (folio_test_idle(folio))
+ folio_clear_idle(folio);
}
- if (!(fgp_flags & FGP_HEAD))
- page = find_subpage(page, index);
+ if (fgp_flags & FGP_STABLE)
+ folio_wait_stable(folio);
no_page:
- if (!page && (fgp_flags & FGP_CREAT)) {
+ if (!folio && (fgp_flags & FGP_CREAT)) {
int err;
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
- gfp_mask |= __GFP_WRITE;
+ gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
- gfp_mask &= ~__GFP_FS;
+ gfp &= ~__GFP_FS;
- page = __page_cache_alloc(gfp_mask);
- if (!page)
+ folio = filemap_alloc_folio(gfp, 0);
+ if (!folio)
return NULL;
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
@@ -1946,27 +1957,27 @@ no_page:
/* Init accessed so avoid atomic mark_page_accessed later */
if (fgp_flags & FGP_ACCESSED)
- __SetPageReferenced(page);
+ __folio_set_referenced(folio);
- err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+ err = filemap_add_folio(mapping, folio, index, gfp);
if (unlikely(err)) {
- put_page(page);
- page = NULL;
+ folio_put(folio);
+ folio = NULL;
if (err == -EEXIST)
goto repeat;
}
/*
- * add_to_page_cache_lru locks the page, and for mmap we expect
- * an unlocked page.
+ * filemap_add_folio locks the page, and for mmap
+ * we expect an unlocked page.
*/
- if (page && (fgp_flags & FGP_FOR_MMAP))
- unlock_page(page);
+ if (folio && (fgp_flags & FGP_FOR_MMAP))
+ folio_unlock(folio);
}
- return page;
+ return folio;
}
-EXPORT_SYMBOL(pagecache_get_page);
+EXPORT_SYMBOL(__filemap_get_folio);
static inline struct page *find_get_entry(struct xa_state *xas, pgoff_t max,
xa_mark_t mark)
@@ -2093,7 +2104,6 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
if (!xa_is_value(page)) {
if (page->index < start)
goto put;
- VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
if (page->index + thp_nr_pages(page) - 1 > end)
goto put;
if (!trylock_page(page))
@@ -2421,6 +2431,7 @@ static int filemap_update_page(struct kiocb *iocb,
struct address_space *mapping, struct iov_iter *iter,
struct page *page)
{
+ struct folio *folio = page_folio(page);
int error;
if (iocb->ki_flags & IOCB_NOWAIT) {
@@ -2430,40 +2441,40 @@ static int filemap_update_page(struct kiocb *iocb,
filemap_invalidate_lock_shared(mapping);
}
- if (!trylock_page(page)) {
+ if (!folio_trylock(folio)) {
error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
goto unlock_mapping;
if (!(iocb->ki_flags & IOCB_WAITQ)) {
filemap_invalidate_unlock_shared(mapping);
- put_and_wait_on_page_locked(page, TASK_KILLABLE);
+ put_and_wait_on_page_locked(&folio->page, TASK_KILLABLE);
return AOP_TRUNCATED_PAGE;
}
- error = __lock_page_async(page, iocb->ki_waitq);
+ error = __folio_lock_async(folio, iocb->ki_waitq);
if (error)
goto unlock_mapping;
}
error = AOP_TRUNCATED_PAGE;
- if (!page->mapping)
+ if (!folio->mapping)
goto unlock;
error = 0;
- if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, page))
+ if (filemap_range_uptodate(mapping, iocb->ki_pos, iter, &folio->page))
goto unlock;
error = -EAGAIN;
if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
goto unlock;
- error = filemap_read_page(iocb->ki_filp, mapping, page);
+ error = filemap_read_page(iocb->ki_filp, mapping, &folio->page);
goto unlock_mapping;
unlock:
- unlock_page(page);
+ folio_unlock(folio);
unlock_mapping:
filemap_invalidate_unlock_shared(mapping);
if (error == AOP_TRUNCATED_PAGE)
- put_page(page);
+ folio_put(folio);
return error;
}
@@ -2625,6 +2636,9 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
iocb->ki_flags |= IOCB_NOWAIT;
+ if (unlikely(iocb->ki_pos >= i_size_read(inode)))
+ break;
+
error = filemap_get_pages(iocb, iter, &pvec);
if (error < 0)
break;
@@ -2737,9 +2751,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- loff_t size;
- size = i_size_read(inode);
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1))
@@ -2771,8 +2783,9 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* the rest of the read. Buffered reads will not work for
* DAX files, so don't bother trying.
*/
- if (retval < 0 || !count || iocb->ki_pos >= size ||
- IS_DAX(inode))
+ if (retval < 0 || !count || IS_DAX(inode))
+ return retval;
+ if (iocb->ki_pos >= i_size_read(inode))
return retval;
}
@@ -2900,7 +2913,9 @@ unlock:
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
struct file **fpin)
{
- if (trylock_page(page))
+ struct folio *folio = page_folio(page);
+
+ if (folio_trylock(folio))
return 1;
/*
@@ -2913,7 +2928,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
if (vmf->flags & FAULT_FLAG_KILLABLE) {
- if (__lock_page_killable(page)) {
+ if (__folio_lock_killable(folio)) {
/*
* We didn't have the right flags to drop the mmap_lock,
* but all fault_handlers only check for fatal signals
@@ -2925,11 +2940,11 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
return 0;
}
} else
- __lock_page(page);
+ __folio_lock(folio);
+
return 1;
}
-
/*
* Synchronous readahead happens when we don't even find a page in the page
* cache at all. We don't want to perform IO under the mmap sem, so if we have
@@ -3195,24 +3210,17 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct page *page)
}
if (pmd_none(*vmf->pmd) && PageTransHuge(page)) {
- vm_fault_t ret = do_set_pmd(vmf, page);
- if (!ret) {
- /* The page is mapped successfully, reference consumed. */
- unlock_page(page);
- return true;
- }
- }
-
- if (pmd_none(*vmf->pmd)) {
- vmf->ptl = pmd_lock(mm, vmf->pmd);
- if (likely(pmd_none(*vmf->pmd))) {
- mm_inc_nr_ptes(mm);
- pmd_populate(mm, vmf->pmd, vmf->prealloc_pte);
- vmf->prealloc_pte = NULL;
+ vm_fault_t ret = do_set_pmd(vmf, page);
+ if (!ret) {
+ /* The page is mapped successfully, reference consumed. */
+ unlock_page(page);
+ return true;
}
- spin_unlock(vmf->ptl);
}
+ if (pmd_none(*vmf->pmd))
+ pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);
+
/* See comment in handle_pte_fault() */
if (pmd_devmap_trans_unstable(vmf->pmd)) {
unlock_page(page);
@@ -3708,28 +3716,6 @@ out:
}
EXPORT_SYMBOL(generic_file_direct_write);
-/*
- * Find or create a page at the given pagecache position. Return the locked
- * page. This function is specifically for buffered writes.
- */
-struct page *grab_cache_page_write_begin(struct address_space *mapping,
- pgoff_t index, unsigned flags)
-{
- struct page *page;
- int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
-
- if (flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
-
- page = pagecache_get_page(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping));
- if (page)
- wait_for_stable_page(page);
-
- return page;
-}
-EXPORT_SYMBOL(grab_cache_page_write_begin);
-
ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
@@ -3757,7 +3743,7 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
new file mode 100644
index 000000000000..5b6ae1da314e
--- /dev/null
+++ b/mm/folio-compat.c
@@ -0,0 +1,142 @@
+/*
+ * Compatibility functions which bloat the callers too much to make inline.
+ * All of the callers of these functions should be converted to use folios
+ * eventually.
+ */
+
+#include <linux/migrate.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+
+struct address_space *page_mapping(struct page *page)
+{
+ return folio_mapping(page_folio(page));
+}
+EXPORT_SYMBOL(page_mapping);
+
+void unlock_page(struct page *page)
+{
+ return folio_unlock(page_folio(page));
+}
+EXPORT_SYMBOL(unlock_page);
+
+void end_page_writeback(struct page *page)
+{
+ return folio_end_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(end_page_writeback);
+
+void wait_on_page_writeback(struct page *page)
+{
+ return folio_wait_writeback(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+
+void wait_for_stable_page(struct page *page)
+{
+ return folio_wait_stable(page_folio(page));
+}
+EXPORT_SYMBOL_GPL(wait_for_stable_page);
+
+bool page_mapped(struct page *page)
+{
+ return folio_mapped(page_folio(page));
+}
+EXPORT_SYMBOL(page_mapped);
+
+void mark_page_accessed(struct page *page)
+{
+ folio_mark_accessed(page_folio(page));
+}
+EXPORT_SYMBOL(mark_page_accessed);
+
+#ifdef CONFIG_MIGRATION
+int migrate_page_move_mapping(struct address_space *mapping,
+ struct page *newpage, struct page *page, int extra_count)
+{
+ return folio_migrate_mapping(mapping, page_folio(newpage),
+ page_folio(page), extra_count);
+}
+EXPORT_SYMBOL(migrate_page_move_mapping);
+
+void migrate_page_states(struct page *newpage, struct page *page)
+{
+ folio_migrate_flags(page_folio(newpage), page_folio(page));
+}
+EXPORT_SYMBOL(migrate_page_states);
+
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+ folio_migrate_copy(page_folio(newpage), page_folio(page));
+}
+EXPORT_SYMBOL(migrate_page_copy);
+#endif
+
+bool set_page_writeback(struct page *page)
+{
+ return folio_start_writeback(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_writeback);
+
+bool set_page_dirty(struct page *page)
+{
+ return folio_mark_dirty(page_folio(page));
+}
+EXPORT_SYMBOL(set_page_dirty);
+
+int __set_page_dirty_nobuffers(struct page *page)
+{
+ return filemap_dirty_folio(page_mapping(page), page_folio(page));
+}
+EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+
+bool clear_page_dirty_for_io(struct page *page)
+{
+ return folio_clear_dirty_for_io(page_folio(page));
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
+bool redirty_page_for_writepage(struct writeback_control *wbc,
+ struct page *page)
+{
+ return folio_redirty_for_writepage(wbc, page_folio(page));
+}
+EXPORT_SYMBOL(redirty_page_for_writepage);
+
+void lru_cache_add(struct page *page)
+{
+ folio_add_lru(page_folio(page));
+}
+EXPORT_SYMBOL(lru_cache_add);
+
+int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
+ pgoff_t index, gfp_t gfp)
+{
+ return filemap_add_folio(mapping, page_folio(page), index, gfp);
+}
+EXPORT_SYMBOL(add_to_page_cache_lru);
+
+noinline
+struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
+ int fgp_flags, gfp_t gfp)
+{
+ struct folio *folio;
+
+ folio = __filemap_get_folio(mapping, index, fgp_flags, gfp);
+ if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio))
+ return &folio->page;
+ return folio_file_page(folio, index);
+}
+EXPORT_SYMBOL(pagecache_get_page);
+
+struct page *grab_cache_page_write_begin(struct address_space *mapping,
+ pgoff_t index, unsigned flags)
+{
+ unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
+
+ if (flags & AOP_FLAG_NOFS)
+ fgp_flags |= FGP_NOFS;
+ return pagecache_get_page(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+}
+EXPORT_SYMBOL(grab_cache_page_write_begin);
diff --git a/mm/gup.c b/mm/gup.c
index 886d6148d3d0..2c51e9748a6a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -918,6 +918,8 @@ static int faultin_page(struct vm_area_struct *vma,
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
+ if (*flags & FOLL_NOFAULT)
+ return -EFAULT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
@@ -1657,6 +1659,141 @@ finish_or_fault:
#endif /* !CONFIG_MMU */
/**
+ * fault_in_writeable - fault in userspace address range for writing
+ * @uaddr: start of address range
+ * @size: size of address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_writeable(char __user *uaddr, size_t size)
+{
+ char __user *start = uaddr, *end;
+
+ if (unlikely(size == 0))
+ return 0;
+ if (!PAGE_ALIGNED(uaddr)) {
+ if (unlikely(__put_user(0, uaddr) != 0))
+ return size;
+ uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
+ }
+ end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
+ if (unlikely(end < start))
+ end = NULL;
+ while (uaddr != end) {
+ if (unlikely(__put_user(0, uaddr) != 0))
+ goto out;
+ uaddr += PAGE_SIZE;
+ }
+
+out:
+ if (size > uaddr - start)
+ return size - (uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_writeable);
+
+/*
+ * fault_in_safe_writeable - fault in an address range for writing
+ * @uaddr: start of address range
+ * @size: length of address range
+ *
+ * Faults in an address range using get_user_pages, i.e., without triggering
+ * hardware page faults. This is primarily useful when we already know that
+ * some or all of the pages in the address range aren't in memory.
+ *
+ * Other than fault_in_writeable(), this function is non-destructive.
+ *
+ * Note that we don't pin or otherwise hold the pages referenced that we fault
+ * in. There's no guarantee that they'll stay in memory for any duration of
+ * time.
+ *
+ * Returns the number of bytes not faulted in, like copy_to_user() and
+ * copy_from_user().
+ */
+size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
+{
+ unsigned long start = (unsigned long)untagged_addr(uaddr);
+ unsigned long end, nstart, nend;
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
+ int locked = 0;
+
+ nstart = start & PAGE_MASK;
+ end = PAGE_ALIGN(start + size);
+ if (end < nstart)
+ end = 0;
+ for (; nstart != end; nstart = nend) {
+ unsigned long nr_pages;
+ long ret;
+
+ if (!locked) {
+ locked = 1;
+ mmap_read_lock(mm);
+ vma = find_vma(mm, nstart);
+ } else if (nstart >= vma->vm_end)
+ vma = vma->vm_next;
+ if (!vma || vma->vm_start >= end)
+ break;
+ nend = end ? min(end, vma->vm_end) : vma->vm_end;
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ continue;
+ if (nstart < vma->vm_start)
+ nstart = vma->vm_start;
+ nr_pages = (nend - nstart) / PAGE_SIZE;
+ ret = __get_user_pages_locked(mm, nstart, nr_pages,
+ NULL, NULL, &locked,
+ FOLL_TOUCH | FOLL_WRITE);
+ if (ret <= 0)
+ break;
+ nend = nstart + ret * PAGE_SIZE;
+ }
+ if (locked)
+ mmap_read_unlock(mm);
+ if (nstart == end)
+ return 0;
+ return size - min_t(size_t, nstart - start, size);
+}
+EXPORT_SYMBOL(fault_in_safe_writeable);
+
+/**
+ * fault_in_readable - fault in userspace address range for reading
+ * @uaddr: start of user address range
+ * @size: size of user address range
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_readable(const char __user *uaddr, size_t size)
+{
+ const char __user *start = uaddr, *end;
+ volatile char c;
+
+ if (unlikely(size == 0))
+ return 0;
+ if (!PAGE_ALIGNED(uaddr)) {
+ if (unlikely(__get_user(c, uaddr) != 0))
+ return size;
+ uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
+ }
+ end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
+ if (unlikely(end < start))
+ end = NULL;
+ while (uaddr != end) {
+ if (unlikely(__get_user(c, uaddr) != 0))
+ goto out;
+ uaddr += PAGE_SIZE;
+ }
+
+out:
+ (void)c;
+ if (size > uaddr - start)
+ return size - (uaddr - start);
+ return 0;
+}
+EXPORT_SYMBOL(fault_in_readable);
+
+/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
*
@@ -2228,7 +2365,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
{
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
- int ret = 1;
do {
struct page *page = pfn_to_page(pfn);
@@ -2236,14 +2372,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, flags, pages);
- ret = 0;
break;
}
SetPageReferenced(page);
pages[*nr] = page;
if (unlikely(!try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
- ret = 0;
break;
}
(*nr)++;
@@ -2251,7 +2385,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
} while (addr += PAGE_SIZE, addr != end);
put_dev_pagemap(pgmap);
- return ret;
+ return addr == end;
}
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
@@ -2708,7 +2842,7 @@ static int internal_get_user_pages_fast(unsigned long start,
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
- FOLL_FAST_ONLY)))
+ FOLL_FAST_ONLY | FOLL_NOFAULT)))
return -EINVAL;
if (gup_flags & FOLL_PIN)
diff --git a/mm/highmem.c b/mm/highmem.c
index 4212ad0e4a19..88f65f155845 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -23,7 +23,6 @@
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
-#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
@@ -383,7 +382,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
if (end1 > start1) {
- kaddr = kmap_atomic(page + i);
+ kaddr = kmap_local_page(page + i);
memset(kaddr + start1, 0, this_end - start1);
}
end1 -= this_end;
@@ -398,7 +397,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
if (end2 > start2) {
if (!kaddr)
- kaddr = kmap_atomic(page + i);
+ kaddr = kmap_local_page(page + i);
memset(kaddr + start2, 0, this_end - start2);
}
end2 -= this_end;
@@ -406,7 +405,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
}
if (kaddr) {
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
flush_dcache_page(page + i);
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 5e9ef0fc261e..e5483347291c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -603,7 +603,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+ if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
@@ -2405,7 +2405,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
static void __split_huge_page(struct page *page, struct list_head *list,
pgoff_t end)
{
- struct page *head = compound_head(page);
+ struct folio *folio = page_folio(page);
+ struct page *head = &folio->page;
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
@@ -2424,7 +2425,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
- lruvec = lock_page_lruvec(head);
+ lruvec = folio_lruvec_lock(folio);
+
+ ClearPageHasHWPoisoned(head);
for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
@@ -2700,12 +2703,14 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping) {
int nr = thp_nr_pages(head);
- if (PageSwapBacked(head))
+ if (PageSwapBacked(head)) {
__mod_lruvec_page_state(head, NR_SHMEM_THPS,
-nr);
- else
+ } else {
__mod_lruvec_page_state(head, NR_FILE_THPS,
-nr);
+ filemap_nr_thps_dec(mapping);
+ }
}
__split_huge_page(page, list, end);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 95dc7b83381f..e09159c957e3 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -50,6 +50,17 @@ struct hstate hstates[HUGE_MAX_HSTATE];
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
+static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
+static bool hugetlb_cma_page(struct page *page, unsigned int order)
+{
+ return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page,
+ 1 << order);
+}
+#else
+static bool hugetlb_cma_page(struct page *page, unsigned int order)
+{
+ return false;
+}
#endif
static unsigned long hugetlb_cma_size __initdata;
@@ -66,6 +77,7 @@ static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
static bool __initdata parsed_valid_hugepagesz = true;
static bool __initdata parsed_default_hugepagesz;
+static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -321,8 +333,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
struct file_region *org)
{
#ifdef CONFIG_CGROUP_HUGETLB
- return rg && org &&
- rg->reservation_counter == org->reservation_counter &&
+ return rg->reservation_counter == org->reservation_counter &&
rg->css == org->css;
#else
@@ -435,7 +446,6 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
t, h, h_cg, regions_needed);
- VM_BUG_ON(add < 0);
return add;
}
@@ -1004,6 +1014,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
vma->vm_private_data = (void *)0;
}
+/*
+ * Reset and decrement one ref on hugepage private reservation.
+ * Called with mm->mmap_sem writer semaphore held.
+ * This function should be only used by move_vma() and operate on
+ * same sized vma. It should never come here with last ref on the
+ * reservation.
+ */
+void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+ /*
+ * Clear the old hugetlb private page reservation.
+ * It has already been transferred to new_vma.
+ *
+ * During a mremap() operation of a hugetlb vma we call move_vma()
+ * which copies vma into new_vma and unmaps vma. After the copy
+ * operation both new_vma and vma share a reference to the resv_map
+ * struct, and at that point vma is about to be unmapped. We don't
+ * want to return the reservation to the pool at unmap of vma because
+ * the reservation still lives on in new_vma, so simply decrement the
+ * ref here and remove the resv_map reference from this vma.
+ */
+ struct resv_map *reservations = vma_resv_map(vma);
+
+ if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ kref_put(&reservations->refs, resv_map_release);
+
+ reset_vma_resv_huge_pages(vma);
+}
+
/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
@@ -1260,9 +1299,9 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static void destroy_compound_gigantic_page(struct page *page,
- unsigned int order)
+/* used to demote non-gigantic_huge pages as well */
+static void __destroy_compound_gigantic_page(struct page *page,
+ unsigned int order, bool demote)
{
int i;
int nr_pages = 1 << order;
@@ -1272,8 +1311,10 @@ static void destroy_compound_gigantic_page(struct page *page,
atomic_set(compound_pincount_ptr(page), 0);
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+ p->mapping = NULL;
clear_compound_head(p);
- set_page_refcounted(p);
+ if (!demote)
+ set_page_refcounted(p);
}
set_compound_order(page, 0);
@@ -1281,6 +1322,19 @@ static void destroy_compound_gigantic_page(struct page *page,
__ClearPageHead(page);
}
+static void destroy_compound_hugetlb_page_for_demote(struct page *page,
+ unsigned int order)
+{
+ __destroy_compound_gigantic_page(page, order, true);
+}
+
+#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
+static void destroy_compound_gigantic_page(struct page *page,
+ unsigned int order)
+{
+ __destroy_compound_gigantic_page(page, order, false);
+}
+
static void free_gigantic_page(struct page *page, unsigned int order)
{
/*
@@ -1353,12 +1407,15 @@ static inline void destroy_compound_gigantic_page(struct page *page,
/*
* Remove hugetlb page from lists, and update dtor so that page appears
- * as just a compound page. A reference is held on the page.
+ * as just a compound page.
+ *
+ * A reference is held on the page, except in the case of demote.
*
* Must be called with hugetlb lock held.
*/
-static void remove_hugetlb_page(struct hstate *h, struct page *page,
- bool adjust_surplus)
+static void __remove_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus,
+ bool demote)
{
int nid = page_to_nid(page);
@@ -1396,8 +1453,12 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
*
* This handles the case where more than one ref is held when and
* after update_and_free_page is called.
+ *
+ * In the case of demote we do not ref count the page as it will soon
+ * be turned into a page of smaller size.
*/
- set_page_refcounted(page);
+ if (!demote)
+ set_page_refcounted(page);
if (hstate_is_gigantic(h))
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
else
@@ -1407,6 +1468,18 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
h->nr_huge_pages_node[nid]--;
}
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ __remove_hugetlb_page(h, page, adjust_surplus, false);
+}
+
+static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page,
+ bool adjust_surplus)
+{
+ __remove_hugetlb_page(h, page, adjust_surplus, true);
+}
+
static void add_hugetlb_page(struct hstate *h, struct page *page,
bool adjust_surplus)
{
@@ -1476,7 +1549,13 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
}
- if (hstate_is_gigantic(h)) {
+
+ /*
+ * Non-gigantic pages demoted from CMA allocated gigantic pages
+ * need to be given back to CMA in free_gigantic_page.
+ */
+ if (hstate_is_gigantic(h) ||
+ hugetlb_cma_page(page, huge_page_order(h))) {
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
} else {
@@ -1664,7 +1743,8 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
spin_unlock_irq(&hugetlb_lock);
}
-static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
+ bool demote)
{
int i, j;
int nr_pages = 1 << order;
@@ -1702,12 +1782,17 @@ static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
* the set of pages can not be converted to a gigantic page.
* The caller who allocated the pages should then discard the
* pages using the appropriate free interface.
+ *
+ * In the case of demote, the ref count will be zero.
*/
- if (!page_ref_freeze(p, 1)) {
- pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
- goto out_error;
+ if (!demote) {
+ if (!page_ref_freeze(p, 1)) {
+ pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+ goto out_error;
+ }
+ } else {
+ VM_BUG_ON_PAGE(page_count(p), p);
}
- set_page_count(p, 0);
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
@@ -1730,6 +1815,17 @@ out_error:
return false;
}
+static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
+{
+ return __prep_compound_gigantic_page(page, order, false);
+}
+
+static bool prep_compound_gigantic_page_for_demote(struct page *page,
+ unsigned int order)
+{
+ return __prep_compound_gigantic_page(page, order, true);
+}
+
/*
* PageHuge() only returns true for hugetlbfs pages, but not for normal or
* transparent huge pages. See the PageTransHuge() documentation for more
@@ -2868,33 +2964,39 @@ out_subpool_put:
return ERR_PTR(-ENOSPC);
}
-int alloc_bootmem_huge_page(struct hstate *h)
+int alloc_bootmem_huge_page(struct hstate *h, int nid)
__attribute__ ((weak, alias("__alloc_bootmem_huge_page")));
-int __alloc_bootmem_huge_page(struct hstate *h)
+int __alloc_bootmem_huge_page(struct hstate *h, int nid)
{
- struct huge_bootmem_page *m;
+ struct huge_bootmem_page *m = NULL; /* initialize for clang */
int nr_nodes, node;
+ if (nid >= nr_online_nodes)
+ return 0;
+ /* do node specific alloc */
+ if (nid != NUMA_NO_NODE) {
+ m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!m)
+ return 0;
+ goto found;
+ }
+ /* allocate from next node when distributing huge pages */
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
- void *addr;
-
- addr = memblock_alloc_try_nid_raw(
+ m = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
- if (addr) {
- /*
- * Use the beginning of the huge page to store the
- * huge_bootmem_page struct (until gather_bootmem
- * puts them into the mem_map).
- */
- m = addr;
- goto found;
- }
+ /*
+ * Use the beginning of the huge page to store the
+ * huge_bootmem_page struct (until gather_bootmem
+ * puts them into the mem_map).
+ */
+ if (!m)
+ return 0;
+ goto found;
}
- return 0;
found:
- BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h)));
/* Put them into a private list first because mem_map is not up yet */
INIT_LIST_HEAD(&m->list);
list_add(&m->list, &huge_boot_pages);
@@ -2934,12 +3036,61 @@ static void __init gather_bootmem_prealloc(void)
cond_resched();
}
}
+static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
+{
+ unsigned long i;
+ char buf[32];
+
+ for (i = 0; i < h->max_huge_pages_node[nid]; ++i) {
+ if (hstate_is_gigantic(h)) {
+ if (!alloc_bootmem_huge_page(h, nid))
+ break;
+ } else {
+ struct page *page;
+ gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+
+ page = alloc_fresh_huge_page(h, gfp_mask, nid,
+ &node_states[N_MEMORY], NULL);
+ if (!page)
+ break;
+ put_page(page); /* free it into the hugepage allocator */
+ }
+ cond_resched();
+ }
+ if (i == h->max_huge_pages_node[nid])
+ return;
+
+ string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n",
+ h->max_huge_pages_node[nid], buf, nid, i);
+ h->max_huge_pages -= (h->max_huge_pages_node[nid] - i);
+ h->max_huge_pages_node[nid] = i;
+}
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
nodemask_t *node_alloc_noretry;
+ bool node_specific_alloc = false;
+
+ /* skip gigantic hugepages allocation if hugetlb_cma enabled */
+ if (hstate_is_gigantic(h) && hugetlb_cma_size) {
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+ return;
+ }
+
+ /* do node specific alloc */
+ for (i = 0; i < nr_online_nodes; i++) {
+ if (h->max_huge_pages_node[i] > 0) {
+ hugetlb_hstate_alloc_pages_onenode(h, i);
+ node_specific_alloc = true;
+ }
+ }
+ if (node_specific_alloc)
+ return;
+
+ /* below will do all node balanced alloc */
if (!hstate_is_gigantic(h)) {
/*
* Bit mask controlling how hard we retry per-node allocations.
@@ -2960,11 +3111,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
- if (hugetlb_cma_size) {
- pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
- goto free;
- }
- if (!alloc_bootmem_huge_page(h))
+ if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE))
break;
} else if (!alloc_pool_huge_page(h,
&node_states[N_MEMORY],
@@ -2980,13 +3127,12 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
-free:
kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
{
- struct hstate *h;
+ struct hstate *h, *h2;
for_each_hstate(h) {
if (minimum_order > huge_page_order(h))
@@ -2995,6 +3141,26 @@ static void __init hugetlb_init_hstates(void)
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
+
+ /*
+ * Set demote order for each hstate. Note that
+ * h->demote_order is initially 0.
+ * - We can not demote gigantic pages if runtime freeing
+ * is not supported, so skip this.
+ * - If CMA allocation is possible, we can not demote
+ * HUGETLB_PAGE_ORDER or smaller size pages.
+ */
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ continue;
+ if (hugetlb_cma_size && h->order <= HUGETLB_PAGE_ORDER)
+ continue;
+ for_each_hstate(h2) {
+ if (h2 == h)
+ continue;
+ if (h2->order < h->order &&
+ h2->order > h->demote_order)
+ h->demote_order = h2->order;
+ }
}
VM_BUG_ON(minimum_order == UINT_MAX);
}
@@ -3235,9 +3401,100 @@ out:
return 0;
}
+static int demote_free_huge_page(struct hstate *h, struct page *page)
+{
+ int i, nid = page_to_nid(page);
+ struct hstate *target_hstate;
+ int rc = 0;
+
+ target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
+
+ remove_hugetlb_page_for_demote(h, page, false);
+ spin_unlock_irq(&hugetlb_lock);
+
+ rc = alloc_huge_page_vmemmap(h, page);
+ if (rc) {
+ /* Allocation of vmemmmap failed, we can not demote page */
+ spin_lock_irq(&hugetlb_lock);
+ set_page_refcounted(page);
+ add_hugetlb_page(h, page, false);
+ return rc;
+ }
+
+ /*
+ * Use destroy_compound_hugetlb_page_for_demote for all huge page
+ * sizes as it will not ref count pages.
+ */
+ destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h));
+
+ /*
+ * Taking target hstate mutex synchronizes with set_max_huge_pages.
+ * Without the mutex, pages added to target hstate could be marked
+ * as surplus.
+ *
+ * Note that we already hold h->resize_lock. To prevent deadlock,
+ * use the convention of always taking larger size hstate mutex first.
+ */
+ mutex_lock(&target_hstate->resize_lock);
+ for (i = 0; i < pages_per_huge_page(h);
+ i += pages_per_huge_page(target_hstate)) {
+ if (hstate_is_gigantic(target_hstate))
+ prep_compound_gigantic_page_for_demote(page + i,
+ target_hstate->order);
+ else
+ prep_compound_page(page + i, target_hstate->order);
+ set_page_private(page + i, 0);
+ set_page_refcounted(page + i);
+ prep_new_huge_page(target_hstate, page + i, nid);
+ put_page(page + i);
+ }
+ mutex_unlock(&target_hstate->resize_lock);
+
+ spin_lock_irq(&hugetlb_lock);
+
+ /*
+ * Not absolutely necessary, but for consistency update max_huge_pages
+ * based on pool changes for the demoted page.
+ */
+ h->max_huge_pages--;
+ target_hstate->max_huge_pages += pages_per_huge_page(h);
+
+ return rc;
+}
+
+static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+ __must_hold(&hugetlb_lock)
+{
+ int nr_nodes, node;
+ struct page *page;
+ int rc = 0;
+
+ lockdep_assert_held(&hugetlb_lock);
+
+ /* We should never get here if no demote order */
+ if (!h->demote_order) {
+ pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
+ return -EINVAL; /* internal error */
+ }
+
+ for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
+ if (!list_empty(&h->hugepage_freelists[node])) {
+ page = list_entry(h->hugepage_freelists[node].next,
+ struct page, lru);
+ rc = demote_free_huge_page(h, page);
+ break;
+ }
+ }
+
+ return rc;
+}
+
#define HSTATE_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define HSTATE_ATTR_WO(_name) \
+ static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
+
#define HSTATE_ATTR(_name) \
static struct kobj_attribute _name##_attr = \
__ATTR(_name, 0644, _name##_show, _name##_store)
@@ -3433,6 +3690,103 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj,
}
HSTATE_ATTR_RO(surplus_hugepages);
+static ssize_t demote_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t len)
+{
+ unsigned long nr_demote;
+ unsigned long nr_available;
+ nodemask_t nodes_allowed, *n_mask;
+ struct hstate *h;
+ int err = 0;
+ int nid;
+
+ err = kstrtoul(buf, 10, &nr_demote);
+ if (err)
+ return err;
+ h = kobj_to_hstate(kobj, &nid);
+
+ if (nid != NUMA_NO_NODE) {
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ } else {
+ n_mask = &node_states[N_MEMORY];
+ }
+
+ /* Synchronize with other sysfs operations modifying huge pages */
+ mutex_lock(&h->resize_lock);
+ spin_lock_irq(&hugetlb_lock);
+
+ while (nr_demote) {
+ /*
+ * Check for available pages to demote each time thorough the
+ * loop as demote_pool_huge_page will drop hugetlb_lock.
+ */
+ if (nid != NUMA_NO_NODE)
+ nr_available = h->free_huge_pages_node[nid];
+ else
+ nr_available = h->free_huge_pages;
+ nr_available -= h->resv_huge_pages;
+ if (!nr_available)
+ break;
+
+ err = demote_pool_huge_page(h, n_mask);
+ if (err)
+ break;
+
+ nr_demote--;
+ }
+
+ spin_unlock_irq(&hugetlb_lock);
+ mutex_unlock(&h->resize_lock);
+
+ if (err)
+ return err;
+ return len;
+}
+HSTATE_ATTR_WO(demote);
+
+static ssize_t demote_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ int nid;
+ struct hstate *h = kobj_to_hstate(kobj, &nid);
+ unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
+
+ return sysfs_emit(buf, "%lukB\n", demote_size);
+}
+
+static ssize_t demote_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hstate *h, *demote_hstate;
+ unsigned long demote_size;
+ unsigned int demote_order;
+ int nid;
+
+ demote_size = (unsigned long)memparse(buf, NULL);
+
+ demote_hstate = size_to_hstate(demote_size);
+ if (!demote_hstate)
+ return -EINVAL;
+ demote_order = demote_hstate->order;
+ if (demote_order < HUGETLB_PAGE_ORDER)
+ return -EINVAL;
+
+ /* demote order must be smaller than hstate order */
+ h = kobj_to_hstate(kobj, &nid);
+ if (demote_order >= h->order)
+ return -EINVAL;
+
+ /* resize_lock synchronizes access to demote size and writes */
+ mutex_lock(&h->resize_lock);
+ h->demote_order = demote_order;
+ mutex_unlock(&h->resize_lock);
+
+ return count;
+}
+HSTATE_ATTR(demote_size);
+
static struct attribute *hstate_attrs[] = {
&nr_hugepages_attr.attr,
&nr_overcommit_hugepages_attr.attr,
@@ -3449,6 +3803,16 @@ static const struct attribute_group hstate_attr_group = {
.attrs = hstate_attrs,
};
+static struct attribute *hstate_demote_attrs[] = {
+ &demote_size_attr.attr,
+ &demote_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group hstate_demote_attr_group = {
+ .attrs = hstate_demote_attrs,
+};
+
static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
struct kobject **hstate_kobjs,
const struct attribute_group *hstate_attr_group)
@@ -3466,6 +3830,12 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
hstate_kobjs[hi] = NULL;
}
+ if (h->demote_order) {
+ if (sysfs_create_group(hstate_kobjs[hi],
+ &hstate_demote_attr_group))
+ pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+ }
+
return retval;
}
@@ -3671,6 +4041,10 @@ static int __init hugetlb_init(void)
}
default_hstate.max_huge_pages =
default_hstate_max_huge_pages;
+
+ for (i = 0; i < nr_online_nodes; i++)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
}
}
@@ -3731,6 +4105,10 @@ void __init hugetlb_add_hstate(unsigned int order)
parsed_hstate = h;
}
+bool __init __weak hugetlb_node_alloc_supported(void)
+{
+ return true;
+}
/*
* hugepages command line processing
* hugepages normally follows a valid hugepagsz or default_hugepagsz
@@ -3742,6 +4120,10 @@ static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
+ int node = NUMA_NO_NODE;
+ int count;
+ unsigned long tmp;
+ char *p = s;
if (!parsed_valid_hugepagesz) {
pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
@@ -3765,8 +4147,40 @@ static int __init hugepages_setup(char *s)
return 0;
}
- if (sscanf(s, "%lu", mhp) <= 0)
- *mhp = 0;
+ while (*p) {
+ count = 0;
+ if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+ goto invalid;
+ /* Parameter is node format */
+ if (p[count] == ':') {
+ if (!hugetlb_node_alloc_supported()) {
+ pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n");
+ return 0;
+ }
+ node = tmp;
+ p += count + 1;
+ if (node < 0 || node >= nr_online_nodes)
+ goto invalid;
+ /* Parse hugepages */
+ if (sscanf(p, "%lu%n", &tmp, &count) != 1)
+ goto invalid;
+ if (!hugetlb_max_hstate)
+ default_hugepages_in_node[node] = tmp;
+ else
+ parsed_hstate->max_huge_pages_node[node] = tmp;
+ *mhp += tmp;
+ /* Go to parse next node*/
+ if (p[count] == ',')
+ p += count + 1;
+ else
+ break;
+ } else {
+ if (p != s)
+ goto invalid;
+ *mhp = tmp;
+ break;
+ }
+ }
/*
* Global state is always initialized later in hugetlb_init.
@@ -3779,6 +4193,10 @@ static int __init hugepages_setup(char *s)
last_mhp = mhp;
return 1;
+
+invalid:
+ pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p);
+ return 0;
}
__setup("hugepages=", hugepages_setup);
@@ -3840,6 +4258,7 @@ __setup("hugepagesz=", hugepagesz_setup);
static int __init default_hugepagesz_setup(char *s)
{
unsigned long size;
+ int i;
parsed_valid_hugepagesz = false;
if (parsed_default_hugepagesz) {
@@ -3868,6 +4287,9 @@ static int __init default_hugepagesz_setup(char *s)
*/
if (default_hstate_max_huge_pages) {
default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ for (i = 0; i < nr_online_nodes; i++)
+ default_hstate.max_huge_pages_node[i] =
+ default_hugepages_in_node[i];
if (hstate_is_gigantic(&default_hstate))
hugetlb_hstate_alloc_pages(&default_hstate);
default_hstate_max_huge_pages = 0;
@@ -4426,9 +4848,85 @@ again:
return ret;
}
-void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- struct page *ref_page)
+static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pte_t *src_pte)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct mm_struct *mm = vma->vm_mm;
+ pte_t *dst_pte, pte;
+ spinlock_t *src_ptl, *dst_ptl;
+
+ dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
+ dst_ptl = huge_pte_lock(h, mm, dst_pte);
+ src_ptl = huge_pte_lockptr(h, mm, src_pte);
+
+ /*
+ * We don't have to worry about the ordering of src and dst ptlocks
+ * because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
+ */
+ if (src_ptl != dst_ptl)
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+
+ pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
+ set_huge_pte_at(mm, new_addr, dst_pte, pte);
+
+ if (src_ptl != dst_ptl)
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+}
+
+int move_hugetlb_page_tables(struct vm_area_struct *vma,
+ struct vm_area_struct *new_vma,
+ unsigned long old_addr, unsigned long new_addr,
+ unsigned long len)
+{
+ struct hstate *h = hstate_vma(vma);
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ unsigned long sz = huge_page_size(h);
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_end = old_addr + len;
+ unsigned long old_addr_copy;
+ pte_t *src_pte, *dst_pte;
+ struct mmu_notifier_range range;
+
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
+ old_end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
+ /* Prevent race with file truncation */
+ i_mmap_lock_write(mapping);
+ for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
+ src_pte = huge_pte_offset(mm, old_addr, sz);
+ if (!src_pte)
+ continue;
+ if (huge_pte_none(huge_ptep_get(src_pte)))
+ continue;
+
+ /* old_addr arg to huge_pmd_unshare() is a pointer and so the
+ * arg may be modified. Pass a copy instead to preserve the
+ * value in old_addr.
+ */
+ old_addr_copy = old_addr;
+
+ if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
+ continue;
+
+ dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
+ if (!dst_pte)
+ break;
+
+ move_huge_pte(vma, old_addr, new_addr, src_pte);
+ }
+ i_mmap_unlock_write(mapping);
+ flush_tlb_range(vma, old_end - len, old_end);
+ mmu_notifier_invalidate_range_end(&range);
+
+ return len + old_addr - old_end;
+}
+
+static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct page *ref_page)
{
struct mm_struct *mm = vma->vm_mm;
unsigned long address;
@@ -4616,7 +5114,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Hugetlb_cow() should be called with page lock of the original hugepage held.
- * Called with hugetlb_instantiation_mutex held and pte_page locked so we
+ * Called with hugetlb_fault_mutex_table held and pte_page locked so we
* cannot race with other handlers or page migration.
* Keep the pte_same checks anyway to make transition from the mutex easier.
*/
@@ -5302,7 +5800,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
goto out;
}
- copy_huge_page(page, *pagep);
+ folio_copy(page_folio(page), page_folio(*pagep));
put_page(*pagep);
*pagep = NULL;
}
@@ -5965,12 +6463,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* sharing is possible. For hugetlbfs, this prevents removal of any page
* table entries associated with the address space. This is important as we
* are setting up sharing based on existing page table entries (mappings).
- *
- * NOTE: This routine is only called from huge_pte_alloc. Some callers of
- * huge_pte_alloc know that sharing is not possible and do not take
- * i_mmap_rwsem as a performance optimization. This is handled by the
- * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
- * only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
@@ -6371,7 +6863,38 @@ static bool cma_reserve_called __initdata;
static int __init cmdline_parse_hugetlb_cma(char *p)
{
- hugetlb_cma_size = memparse(p, &p);
+ int nid, count = 0;
+ unsigned long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%lu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ nid = tmp;
+ if (nid < 0 || nid >= MAX_NUMNODES)
+ break;
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ hugetlb_cma_size_in_node[nid] = tmp;
+ hugetlb_cma_size += tmp;
+
+ /*
+ * Skip the separator if have one, otherwise
+ * break the parsing.
+ */
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else {
+ hugetlb_cma_size = memparse(p, &p);
+ break;
+ }
+ }
+
return 0;
}
@@ -6380,6 +6903,7 @@ early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
void __init hugetlb_cma_reserve(int order)
{
unsigned long size, reserved, per_node;
+ bool node_specific_cma_alloc = false;
int nid;
cma_reserve_called = true;
@@ -6387,30 +6911,72 @@ void __init hugetlb_cma_reserve(int order)
if (!hugetlb_cma_size)
return;
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ if (!node_state(nid, N_ONLINE)) {
+ pr_warn("hugetlb_cma: invalid node %d specified\n", nid);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ continue;
+ }
+
+ if (hugetlb_cma_size_in_node[nid] < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area of node %d should be at least %lu MiB\n",
+ nid, (PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size -= hugetlb_cma_size_in_node[nid];
+ hugetlb_cma_size_in_node[nid] = 0;
+ } else {
+ node_specific_cma_alloc = true;
+ }
+ }
+
+ /* Validate the CMA size again in case some invalid nodes specified. */
+ if (!hugetlb_cma_size)
+ return;
+
if (hugetlb_cma_size < (PAGE_SIZE << order)) {
pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
(PAGE_SIZE << order) / SZ_1M);
+ hugetlb_cma_size = 0;
return;
}
- /*
- * If 3 GB area is requested on a machine with 4 numa nodes,
- * let's allocate 1 GB on first three nodes and ignore the last one.
- */
- per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
- pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
- hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ if (!node_specific_cma_alloc) {
+ /*
+ * If 3 GB area is requested on a machine with 4 numa nodes,
+ * let's allocate 1 GB on first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+ }
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
char name[CMA_MAX_NAME];
- size = min(per_node, hugetlb_cma_size - reserved);
+ if (node_specific_cma_alloc) {
+ if (hugetlb_cma_size_in_node[nid] == 0)
+ continue;
+
+ size = hugetlb_cma_size_in_node[nid];
+ } else {
+ size = min(per_node, hugetlb_cma_size - reserved);
+ }
+
size = round_up(size, PAGE_SIZE << order);
snprintf(name, sizeof(name), "hugetlb%d", nid);
- res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+ /*
+ * Note that 'order per bit' is based on smallest size that
+ * may be returned to CMA allocator in the case of
+ * huge page demotion.
+ */
+ res = cma_declare_contiguous_nid(0, size, 0,
+ PAGE_SIZE << HUGETLB_PAGE_ORDER,
0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
@@ -6426,6 +6992,13 @@ void __init hugetlb_cma_reserve(int order)
if (reserved >= hugetlb_cma_size)
break;
}
+
+ if (!reserved)
+ /*
+ * hugetlb_cma_size is used to determine if allocations from
+ * cma are possible. Set to zero if no cma regions are set up.
+ */
+ hugetlb_cma_size = 0;
}
void __init hugetlb_cma_check(void)
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 5383023d0cca..79d93534ef1e 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -27,9 +27,6 @@
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
-#define hugetlb_cgroup_from_counter(counter, idx) \
- container_of(counter, struct hugetlb_cgroup, hugepage[idx])
-
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
static inline struct page_counter *
diff --git a/mm/internal.h b/mm/internal.h
index cf3cb933eba3..3b79a5c9427a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -34,10 +34,40 @@
void page_writeback_init(void);
+static inline void *folio_raw_mapping(struct folio *folio)
+{
+ unsigned long mapping = (unsigned long)folio->mapping;
+
+ return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
+}
+
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+ int nr_throttled);
+static inline void acct_reclaim_writeback(struct folio *folio)
+{
+ pg_data_t *pgdat = folio_pgdat(folio);
+ int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
+
+ if (nr_throttled)
+ __acct_reclaim_writeback(pgdat, folio, nr_throttled);
+}
+
+static inline void wake_throttle_isolated(pg_data_t *pgdat)
+{
+ wait_queue_head_t *wqh;
+
+ wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+}
+
vm_fault_t do_swap_page(struct vm_fault *vmf);
+void folio_rotate_reclaimable(struct folio *folio);
+bool __folio_end_writeback(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
@@ -63,17 +93,28 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
/**
- * page_evictable - test whether a page is evictable
- * @page: the page to test
- *
- * Test whether page is evictable--i.e., should be placed on active/inactive
- * lists vs unevictable list.
+ * folio_evictable - Test whether a folio is evictable.
+ * @folio: The folio to test.
*
- * Reasons page might not be evictable:
- * (1) page's mapping marked unevictable
- * (2) page is part of an mlocked VMA
+ * Test whether @folio is evictable -- i.e., should be placed on
+ * active/inactive lists vs unevictable list.
*
+ * Reasons folio might not be evictable:
+ * 1. folio's mapping marked unevictable
+ * 2. One of the pages in the folio is part of an mlocked VMA
*/
+static inline bool folio_evictable(struct folio *folio)
+{
+ bool ret;
+
+ /* Prevent address_space of inode and swap cache from being freed */
+ rcu_read_lock();
+ ret = !mapping_unevictable(folio_mapping(folio)) &&
+ !folio_test_mlocked(folio);
+ rcu_read_unlock();
+ return ret;
+}
+
static inline bool page_evictable(struct page *page)
{
bool ret;
@@ -109,6 +150,7 @@ extern unsigned long highest_memmap_pfn;
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
+extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
/*
* in mm/rmap.c:
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 2baf121fb8c5..8428da2aaf17 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -30,20 +30,20 @@
#include "kasan.h"
#include "../slab.h"
-depot_stack_handle_t kasan_save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
nr_entries = filter_irq_stacks(entries, nr_entries);
- return stack_depot_save(entries, nr_entries, flags);
+ return __stack_depot_save(entries, nr_entries, flags, can_alloc);
}
void kasan_set_track(struct kasan_track *track, gfp_t flags)
{
track->pid = current->pid;
- track->stack = kasan_save_stack(flags);
+ track->stack = kasan_save_stack(flags, true);
}
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
@@ -298,7 +298,7 @@ static inline u8 assign_tag(struct kmem_cache *cache,
/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
/* For SLAB assign tags based on the object index in the freelist. */
- return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
+ return (u8)obj_to_index(cache, virt_to_head_page(object), (void *)object);
#else
/*
* For SLUB assign a random tag during slab creation, otherwise reuse
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index c3f5ba7a294a..84a038b07c6f 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -328,7 +328,7 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
-void kasan_record_aux_stack(void *addr)
+static void __kasan_record_aux_stack(void *addr, bool can_alloc)
{
struct page *page = kasan_addr_to_page(addr);
struct kmem_cache *cache;
@@ -345,7 +345,17 @@ void kasan_record_aux_stack(void *addr)
return;
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
- alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+ alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc);
+}
+
+void kasan_record_aux_stack(void *addr)
+{
+ return __kasan_record_aux_stack(addr, true);
+}
+
+void kasan_record_aux_stack_noalloc(void *addr)
+{
+ return __kasan_record_aux_stack(addr, false);
}
void kasan_set_free_info(struct kmem_cache *cache,
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 05d1e9460e2e..dc892119e88f 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -29,6 +29,7 @@ enum kasan_arg_mode {
KASAN_ARG_MODE_DEFAULT,
KASAN_ARG_MODE_SYNC,
KASAN_ARG_MODE_ASYNC,
+ KASAN_ARG_MODE_ASYMM,
};
enum kasan_arg_stacktrace {
@@ -45,9 +46,9 @@ static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
EXPORT_SYMBOL(kasan_flag_enabled);
-/* Whether the asynchronous mode is enabled. */
-bool kasan_flag_async __ro_after_init;
-EXPORT_SYMBOL_GPL(kasan_flag_async);
+/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/
+enum kasan_mode kasan_mode __ro_after_init;
+EXPORT_SYMBOL_GPL(kasan_mode);
/* Whether to collect alloc/free stack traces. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
@@ -69,7 +70,7 @@ static int __init early_kasan_flag(char *arg)
}
early_param("kasan", early_kasan_flag);
-/* kasan.mode=sync/async */
+/* kasan.mode=sync/async/asymm */
static int __init early_kasan_mode(char *arg)
{
if (!arg)
@@ -79,6 +80,8 @@ static int __init early_kasan_mode(char *arg)
kasan_arg_mode = KASAN_ARG_MODE_SYNC;
else if (!strcmp(arg, "async"))
kasan_arg_mode = KASAN_ARG_MODE_ASYNC;
+ else if (!strcmp(arg, "asymm"))
+ kasan_arg_mode = KASAN_ARG_MODE_ASYMM;
else
return -EINVAL;
@@ -116,11 +119,13 @@ void kasan_init_hw_tags_cpu(void)
return;
/*
- * Enable async mode only when explicitly requested through
- * the command line.
+ * Enable async or asymm modes only when explicitly requested
+ * through the command line.
*/
if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
hw_enable_tagging_async();
+ else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
+ hw_enable_tagging_asymm();
else
hw_enable_tagging_sync();
}
@@ -143,15 +148,19 @@ void __init kasan_init_hw_tags(void)
case KASAN_ARG_MODE_DEFAULT:
/*
* Default to sync mode.
- * Do nothing, kasan_flag_async keeps its default value.
*/
- break;
+ fallthrough;
case KASAN_ARG_MODE_SYNC:
- /* Do nothing, kasan_flag_async keeps its default value. */
+ /* Sync mode enabled. */
+ kasan_mode = KASAN_MODE_SYNC;
break;
case KASAN_ARG_MODE_ASYNC:
/* Async mode enabled. */
- kasan_flag_async = true;
+ kasan_mode = KASAN_MODE_ASYNC;
+ break;
+ case KASAN_ARG_MODE_ASYMM:
+ /* Asymm mode enabled. */
+ kasan_mode = KASAN_MODE_ASYMM;
break;
}
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 8bf568a80eb8..aebd8df86a1f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -13,16 +13,28 @@
#include "../slab.h"
DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
-extern bool kasan_flag_async __ro_after_init;
+
+enum kasan_mode {
+ KASAN_MODE_SYNC,
+ KASAN_MODE_ASYNC,
+ KASAN_MODE_ASYMM,
+};
+
+extern enum kasan_mode kasan_mode __ro_after_init;
static inline bool kasan_stack_collection_enabled(void)
{
return static_branch_unlikely(&kasan_flag_stacktrace);
}
-static inline bool kasan_async_mode_enabled(void)
+static inline bool kasan_async_fault_possible(void)
+{
+ return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM;
+}
+
+static inline bool kasan_sync_fault_possible(void)
{
- return kasan_flag_async;
+ return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
}
#else
@@ -31,14 +43,17 @@ static inline bool kasan_stack_collection_enabled(void)
return true;
}
-static inline bool kasan_async_mode_enabled(void)
+static inline bool kasan_async_fault_possible(void)
{
return false;
}
-#endif
+static inline bool kasan_sync_fault_possible(void)
+{
+ return true;
+}
-extern bool kasan_flag_async __ro_after_init;
+#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
@@ -251,7 +266,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip);
struct page *kasan_addr_to_page(const void *addr);
-depot_stack_handle_t kasan_save_stack(gfp_t flags);
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
void kasan_set_track(struct kasan_track *track, gfp_t flags);
void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
@@ -289,6 +304,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifndef arch_enable_tagging_async
#define arch_enable_tagging_async()
#endif
+#ifndef arch_enable_tagging_asymm
+#define arch_enable_tagging_asymm()
+#endif
#ifndef arch_force_async_tag_fault
#define arch_force_async_tag_fault()
#endif
@@ -304,6 +322,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_enable_tagging_sync() arch_enable_tagging_sync()
#define hw_enable_tagging_async() arch_enable_tagging_async()
+#define hw_enable_tagging_asymm() arch_enable_tagging_asymm()
#define hw_force_async_tag_fault() arch_force_async_tag_fault()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
@@ -314,6 +333,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_enable_tagging_sync()
#define hw_enable_tagging_async()
+#define hw_enable_tagging_asymm()
#endif /* CONFIG_KASAN_HW_TAGS */
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 884a950c7026..0bc10f452f7e 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -112,7 +112,7 @@ static void start_report(unsigned long *flags)
static void end_report(unsigned long *flags, unsigned long addr)
{
- if (!kasan_async_mode_enabled())
+ if (!kasan_async_fault_possible())
trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -132,20 +132,11 @@ static void end_report(unsigned long *flags, unsigned long addr)
kasan_enable_current();
}
-static void print_stack(depot_stack_handle_t stack)
-{
- unsigned long *entries;
- unsigned int nr_entries;
-
- nr_entries = stack_depot_fetch(stack, &entries);
- stack_trace_print(entries, nr_entries, 0);
-}
-
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
- print_stack(track->stack);
+ stack_depot_print(track->stack);
} else {
pr_err("(stack is not available)\n");
}
@@ -214,12 +205,12 @@ static void describe_object_stacks(struct kmem_cache *cache, void *object,
return;
if (alloc_meta->aux_stack[0]) {
pr_err("Last potentially related work creation:\n");
- print_stack(alloc_meta->aux_stack[0]);
+ stack_depot_print(alloc_meta->aux_stack[0]);
pr_err("\n");
}
if (alloc_meta->aux_stack[1]) {
pr_err("Second to last potentially related work creation:\n");
- print_stack(alloc_meta->aux_stack[1]);
+ stack_depot_print(alloc_meta->aux_stack[1]);
pr_err("\n");
}
#endif
@@ -235,7 +226,7 @@ static void describe_object(struct kmem_cache *cache, void *object,
static inline bool kernel_or_module_addr(const void *addr)
{
- if (addr >= (void *)_stext && addr < (void *)_end)
+ if (is_kernel((unsigned long)addr))
return true;
if (is_module_address((unsigned long)addr))
return true;
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index 8d95ee52d019..4a4929b29a23 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -254,6 +254,11 @@ core_initcall(kasan_memhotplug_init);
#ifdef CONFIG_KASAN_VMALLOC
+void __init __weak kasan_populate_early_vm_area_shadow(void *start,
+ unsigned long size)
+{
+}
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7a97db8bc8e7..09945784df9e 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -10,12 +10,15 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
+#include <linux/hash.h>
#include <linux/irq_work.h>
+#include <linux/jhash.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
+#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/random.h>
@@ -82,6 +85,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+/* Pool usage% threshold when currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __ro_after_init;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -97,14 +104,41 @@ struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
-#ifdef CONFIG_KFENCE_STATIC_KEYS
-/* The static key to set up a KFENCE allocation. */
+/*
+ * The static key to set up a KFENCE allocation; or if static keys are not used
+ * to gate allocations, to avoid a load and compare if KFENCE is disabled.
+ */
DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
-#endif
/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probablity of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ * P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
+ */
+#define ALLOC_COVERED_HNUM 2
+#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
+
+/*
+ * Randomness for stack hashes, making the same collisions across reboots and
+ * different machines less likely.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
/* Statistics counters for debugfs. */
enum kfence_counter_id {
KFENCE_COUNTER_ALLOCATED,
@@ -112,6 +146,9 @@ enum kfence_counter_id {
KFENCE_COUNTER_FREES,
KFENCE_COUNTER_ZOMBIES,
KFENCE_COUNTER_BUGS,
+ KFENCE_COUNTER_SKIP_INCOMPAT,
+ KFENCE_COUNTER_SKIP_CAPACITY,
+ KFENCE_COUNTER_SKIP_COVERED,
KFENCE_COUNTER_COUNT,
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -121,11 +158,59 @@ static const char *const counter_names[] = {
[KFENCE_COUNTER_FREES] = "total frees",
[KFENCE_COUNTER_ZOMBIES] = "zombie allocations",
[KFENCE_COUNTER_BUGS] = "total bugs",
+ [KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)",
+ [KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)",
+ [KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)",
};
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
/* === Internals ============================================================ */
+static inline bool should_skip_covered(void)
+{
+ unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+ return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+ num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+ num_entries = filter_irq_stacks(stack_entries, num_entries);
+ return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for allocation stack trace hash
+ * @alloc_stack_hash from Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+ int i;
+
+ for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+ if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+ return false;
+ alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+ }
+
+ return true;
+}
+
static bool kfence_protect(unsigned long addr)
{
return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -183,19 +268,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
* Update the object's metadata state, including updating the alloc/free stacks
* depending on the state transition.
*/
-static noinline void metadata_update_state(struct kfence_metadata *meta,
- enum kfence_object_state next)
+static noinline void
+metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
+ unsigned long *stack_entries, size_t num_stack_entries)
{
struct kfence_track *track =
next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
lockdep_assert_held(&meta->lock);
- /*
- * Skip over 1 (this) functions; noinline ensures we do not accidentally
- * skip over the caller by never inlining.
- */
- track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+ if (stack_entries) {
+ memcpy(track->stack_entries, stack_entries,
+ num_stack_entries * sizeof(stack_entries[0]));
+ } else {
+ /*
+ * Skip over 1 (this) functions; noinline ensures we do not
+ * accidentally skip over the caller by never inlining.
+ */
+ num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+ }
+ track->num_stack_entries = num_stack_entries;
track->pid = task_pid_nr(current);
track->cpu = raw_smp_processor_id();
track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
@@ -218,12 +310,19 @@ static inline bool set_canary_byte(u8 *addr)
/* Check canary byte at @addr. */
static inline bool check_canary_byte(u8 *addr)
{
+ struct kfence_metadata *meta;
+ unsigned long flags;
+
if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
return true;
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
- kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr),
- KFENCE_ERROR_CORRUPTION);
+
+ meta = addr_to_metadata((unsigned long)addr);
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
return false;
}
@@ -233,8 +332,6 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
unsigned long addr;
- lockdep_assert_held(&meta->lock);
-
/*
* We'll iterate over each canary byte per-side until fn() returns
* false. However, we'll still iterate over the canary bytes to the
@@ -257,7 +354,9 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
}
}
-static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
+ unsigned long *stack_entries, size_t num_stack_entries,
+ u32 alloc_stack_hash)
{
struct kfence_metadata *meta = NULL;
unsigned long flags;
@@ -271,8 +370,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
list_del_init(&meta->list);
}
raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
- if (!meta)
+ if (!meta) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
return NULL;
+ }
if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
/*
@@ -314,11 +415,14 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
addr = (void *)meta->addr;
/* Update remaining metadata. */
- metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+ metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
WRITE_ONCE(meta->cache, cache);
meta->size = size;
- for_each_canary(meta, set_canary_byte);
+ meta->alloc_stack_hash = alloc_stack_hash;
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ alloc_covered_add(alloc_stack_hash, 1);
/* Set required struct page fields. */
page = virt_to_page(meta->addr);
@@ -328,9 +432,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
if (IS_ENABLED(CONFIG_SLAB))
page->s_mem = addr;
- raw_spin_unlock_irqrestore(&meta->lock, flags);
-
/* Memory initialization. */
+ for_each_canary(meta, set_canary_byte);
/*
* We check slab_want_init_on_alloc() ourselves, rather than letting
@@ -355,6 +458,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
{
struct kcsan_scoped_access assert_page_exclusive;
unsigned long flags;
+ bool init;
raw_spin_lock_irqsave(&meta->lock, flags);
@@ -382,6 +486,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
meta->unprotected_page = 0;
}
+ /* Mark the object as freed. */
+ metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
+ init = slab_want_init_on_free(meta->cache);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ alloc_covered_add(meta->alloc_stack_hash, -1);
+
/* Check canary bytes for memory corruption. */
for_each_canary(meta, check_canary_byte);
@@ -390,14 +501,9 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
* data is still there, and after a use-after-free is detected, we
* unprotect the page, so the data is still accessible.
*/
- if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
+ if (!zombie && unlikely(init))
memzero_explicit(addr, meta->size);
- /* Mark the object as freed. */
- metadata_update_state(meta, KFENCE_OBJECT_FREED);
-
- raw_spin_unlock_irqrestore(&meta->lock, flags);
-
/* Protect to detect use-after-frees. */
kfence_protect((unsigned long)addr);
@@ -663,11 +769,14 @@ void __init kfence_init(void)
if (!kfence_sample_interval)
return;
+ stack_hash_seed = (u32)random_get_entropy();
if (!kfence_init_pool()) {
pr_err("%s failed\n", __func__);
return;
}
+ if (!IS_ENABLED(CONFIG_KFENCE_STATIC_KEYS))
+ static_branch_enable(&kfence_allocation_key);
WRITE_ONCE(kfence_enabled, true);
queue_delayed_work(system_unbound_wq, &kfence_timer, 0);
pr_info("initialized - using %lu bytes for %d objects at 0x%p-0x%p\n", KFENCE_POOL_SIZE,
@@ -736,12 +845,18 @@ void kfence_shutdown_cache(struct kmem_cache *s)
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
+ unsigned long stack_entries[KFENCE_STACK_DEPTH];
+ size_t num_stack_entries;
+ u32 alloc_stack_hash;
+
/*
* Perform size check before switching kfence_allocation_gate, so that
* we don't disable KFENCE without making an allocation.
*/
- if (size > PAGE_SIZE)
+ if (size > PAGE_SIZE) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
+ }
/*
* Skip allocations from non-default zones, including DMA. We cannot
@@ -749,15 +864,12 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
* properties (e.g. reside in DMAable memory).
*/
if ((flags & GFP_ZONEMASK) ||
- (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+ (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
+ }
- /*
- * allocation_gate only needs to become non-zero, so it doesn't make
- * sense to continue writing to it and pay the associated contention
- * cost, in case we have a large number of concurrent allocations.
- */
- if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
+ if (atomic_inc_return(&kfence_allocation_gate) > 1)
return NULL;
#ifdef CONFIG_KFENCE_STATIC_KEYS
/*
@@ -776,7 +888,25 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
if (!READ_ONCE(kfence_enabled))
return NULL;
- return kfence_guarded_alloc(s, size, flags);
+ num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
+
+ /*
+ * Do expensive check for coverage of allocation in slow-path after
+ * allocation_gate has already become non-zero, even though it might
+ * mean not making any allocation within a given sample interval.
+ *
+ * This ensures reasonable allocation coverage when the pool is almost
+ * full, including avoiding long-lived allocations of the same source
+ * filling up the pool (e.g. pagecache allocations).
+ */
+ alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+ if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+ atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+ return NULL;
+ }
+
+ return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
+ alloc_stack_hash);
}
size_t kfence_ksize(const void *addr)
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index c1f23c61e5f9..2a2d5de9d379 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -87,6 +87,8 @@ struct kfence_metadata {
/* Allocation and free stack information. */
struct kfence_track alloc_track;
struct kfence_track free_track;
+ /* For updating alloc_covered on frees. */
+ u32 alloc_stack_hash;
};
extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index f1690cf54199..695030c1fff8 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -32,6 +32,11 @@
#define arch_kfence_test_address(addr) (addr)
#endif
+#define KFENCE_TEST_REQUIRES(test, cond) do { \
+ if (!(cond)) \
+ kunit_skip((test), "Test requires: " #cond); \
+} while (0)
+
/* Report as observed from console. */
static struct {
spinlock_t lock;
@@ -555,8 +560,7 @@ static void test_init_on_free(struct kunit *test)
};
int i;
- if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
- return;
+ KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON));
/* Assume it hasn't been disabled on command line. */
setup_test_cache(test, size, 0, NULL);
@@ -603,10 +607,8 @@ static void test_gfpzero(struct kunit *test)
char *buf1, *buf2;
int i;
- if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
- kunit_warn(test, "skipping ... would take too long\n");
- return;
- }
+ /* Skip if we think it'd take too long. */
+ KFENCE_TEST_REQUIRES(test, CONFIG_KFENCE_SAMPLE_INTERVAL <= 100);
setup_test_cache(test, size, 0, NULL);
buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 045cc579f724..e99101162f1a 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -445,22 +445,25 @@ static bool hugepage_vma_check(struct vm_area_struct *vma,
if (!transhuge_vma_enabled(vma, vm_flags))
return false;
+ if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
+ vma->vm_pgoff, HPAGE_PMD_NR))
+ return false;
+
/* Enabled via shmem mount options or sysfs settings. */
- if (shmem_file(vma->vm_file) && shmem_huge_enabled(vma)) {
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
- }
+ if (shmem_file(vma->vm_file))
+ return shmem_huge_enabled(vma);
/* THP settings require madvise. */
if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
return false;
- /* Read-only file mappings need to be aligned for THP to work. */
+ /* Only regular file is valid */
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && vma->vm_file &&
- !inode_is_open_for_write(vma->vm_file->f_inode) &&
(vm_flags & VM_EXEC)) {
- return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
- HPAGE_PMD_NR);
+ struct inode *inode = vma->vm_file->f_inode;
+
+ return !inode_is_open_for_write(inode) &&
+ S_ISREG(inode->i_mode);
}
if (!vma->anon_vma || vma->vm_ops)
@@ -1087,7 +1090,7 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out_nolock;
}
- if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
+ if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
@@ -1211,7 +1214,7 @@ out_up_write:
mmap_write_unlock(mm);
out_nolock:
if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(*hpage);
+ mem_cgroup_uncharge(page_folio(*hpage));
trace_mm_collapse_huge_page(mm, isolated, result);
return;
}
@@ -1658,7 +1661,7 @@ static void collapse_file(struct mm_struct *mm,
goto out;
}
- if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
+ if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
@@ -1763,6 +1766,10 @@ static void collapse_file(struct mm_struct *mm,
filemap_flush(mapping);
result = SCAN_FAIL;
goto xa_unlocked;
+ } else if (PageWriteback(page)) {
+ xas_unlock_irq(&xas);
+ result = SCAN_FAIL;
+ goto xa_unlocked;
} else if (trylock_page(page)) {
get_page(page);
xas_unlock_irq(&xas);
@@ -1798,7 +1805,8 @@ static void collapse_file(struct mm_struct *mm,
goto out_unlock;
}
- if (!is_shmem && PageDirty(page)) {
+ if (!is_shmem && (PageDirty(page) ||
+ PageWriteback(page))) {
/*
* khugepaged only works on read-only fd, so this
* page is dirty because it hasn't been flushed
@@ -1975,7 +1983,7 @@ xa_unlocked:
out:
VM_BUG_ON(!list_empty(&pagelist));
if (!IS_ERR_OR_NULL(*hpage))
- mem_cgroup_uncharge(*hpage);
+ mem_cgroup_uncharge(page_folio(*hpage));
/* TODO: tracepoints */
}
@@ -2291,6 +2299,11 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
+ if (!khugepaged_enabled()) {
+ calculate_min_free_kbytes();
+ goto update_wmarks;
+ }
+
for_each_populated_zone(zone) {
/*
* We don't need to worry about fragmentation of
@@ -2326,6 +2339,8 @@ static void set_recommended_min_free_kbytes(void)
min_free_kbytes = recommended_min;
}
+
+update_wmarks:
setup_per_zone_wmarks();
}
@@ -2347,12 +2362,11 @@ int start_stop_khugepaged(void)
if (!list_empty(&khugepaged_scan.mm_head))
wake_up_interruptible(&khugepaged_wait);
-
- set_recommended_min_free_kbytes();
} else if (khugepaged_thread) {
kthread_stop(khugepaged_thread);
khugepaged_thread = NULL;
}
+ set_recommended_min_free_kbytes();
fail:
mutex_unlock(&khugepaged_mutex);
return err;
diff --git a/mm/ksm.c b/mm/ksm.c
index a5716fdec1aa..0662093237e4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -751,7 +751,7 @@ stale:
/*
* We come here from above when page->mapping or !PageSwapCache
* suggests that the node is stale; but it might be under migration.
- * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
+ * We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
* before checking whether node->kpfn has been changed.
*/
smp_rmb();
@@ -852,9 +852,14 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}
+static inline struct stable_node *folio_stable_node(struct folio *folio)
+{
+ return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
+}
+
static inline struct stable_node *page_stable_node(struct page *page)
{
- return PageKsm(page) ? page_rmapping(page) : NULL;
+ return folio_stable_node(page_folio(page));
}
static inline void set_page_stable_node(struct page *page,
@@ -2578,7 +2583,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
return page; /* let do_swap_page report the error */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
- if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
+ if (new_page &&
+ mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
put_page(new_page);
new_page = NULL;
}
@@ -2658,26 +2664,26 @@ again:
}
#ifdef CONFIG_MIGRATION
-void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
struct stable_node *stable_node;
- VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
+ VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
- stable_node = page_stable_node(newpage);
+ stable_node = folio_stable_node(folio);
if (stable_node) {
- VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
- stable_node->kpfn = page_to_pfn(newpage);
+ VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
+ stable_node->kpfn = folio_pfn(newfolio);
/*
- * newpage->mapping was set in advance; now we need smp_wmb()
+ * newfolio->mapping was set in advance; now we need smp_wmb()
* to make sure that the new stable_node->kpfn is visible
- * to get_ksm_page() before it can see that oldpage->mapping
- * has gone stale (or that PageSwapCache has been cleared).
+ * to get_ksm_page() before it can see that folio->mapping
+ * has gone stale (or that folio_test_swapcache has been cleared).
*/
smp_wmb();
- set_page_stable_node(oldpage, NULL);
+ set_page_stable_node(&folio->page, NULL);
}
}
#endif /* CONFIG_MIGRATION */
diff --git a/mm/list_lru.c b/mm/list_lru.c
index cd58790d0fb3..0cd5e89ca063 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -15,18 +15,29 @@
#include "slab.h"
#ifdef CONFIG_MEMCG_KMEM
-static LIST_HEAD(list_lrus);
+static LIST_HEAD(memcg_list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return lru->memcg_aware;
+}
+
static void list_lru_register(struct list_lru *lru)
{
+ if (!list_lru_memcg_aware(lru))
+ return;
+
mutex_lock(&list_lrus_mutex);
- list_add(&lru->list, &list_lrus);
+ list_add(&lru->list, &memcg_list_lrus);
mutex_unlock(&list_lrus_mutex);
}
static void list_lru_unregister(struct list_lru *lru)
{
+ if (!list_lru_memcg_aware(lru))
+ return;
+
mutex_lock(&list_lrus_mutex);
list_del(&lru->list);
mutex_unlock(&list_lrus_mutex);
@@ -37,11 +48,6 @@ static int lru_shrinker_id(struct list_lru *lru)
return lru->shrinker_id;
}
-static inline bool list_lru_memcg_aware(struct list_lru *lru)
-{
- return lru->memcg_aware;
-}
-
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
{
@@ -176,13 +182,16 @@ unsigned long list_lru_count_one(struct list_lru *lru,
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
- unsigned long count;
+ long count;
rcu_read_lock();
l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
count = READ_ONCE(l->nr_items);
rcu_read_unlock();
+ if (unlikely(count < 0))
+ count = 0;
+
return count;
}
EXPORT_SYMBOL_GPL(list_lru_count_one);
@@ -354,8 +363,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
int size = memcg_nr_cache_ids;
- memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
- size * sizeof(void *), GFP_KERNEL);
+ memcg_lrus = kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL);
if (!memcg_lrus)
return -ENOMEM;
@@ -389,7 +397,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
old = rcu_dereference_protected(nlru->memcg_lrus,
lockdep_is_held(&list_lrus_mutex));
- new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
+ new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL);
if (!new)
return -ENOMEM;
@@ -398,19 +406,8 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
return -ENOMEM;
}
- memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
-
- /*
- * The locking below allows readers that hold nlru->lock avoid taking
- * rcu_read_lock (see list_lru_from_memcg_idx).
- *
- * Since list_lru_{add,del} may be called under an IRQ-safe lock,
- * we have to use IRQ-safe primitives here to avoid deadlock.
- */
- spin_lock_irq(&nlru->lock);
+ memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size));
rcu_assign_pointer(nlru->memcg_lrus, new);
- spin_unlock_irq(&nlru->lock);
-
kvfree_rcu(old, rcu);
return 0;
}
@@ -466,9 +463,6 @@ static int memcg_update_list_lru(struct list_lru *lru,
{
int i;
- if (!list_lru_memcg_aware(lru))
- return 0;
-
for_each_node(i) {
if (memcg_update_list_lru_node(&lru->node[i],
old_size, new_size))
@@ -491,9 +485,6 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru,
{
int i;
- if (!list_lru_memcg_aware(lru))
- return;
-
for_each_node(i)
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
@@ -506,7 +497,7 @@ int memcg_update_all_list_lrus(int new_size)
int old_size = memcg_nr_cache_ids;
mutex_lock(&list_lrus_mutex);
- list_for_each_entry(lru, &list_lrus, list) {
+ list_for_each_entry(lru, &memcg_list_lrus, list) {
ret = memcg_update_list_lru(lru, old_size, new_size);
if (ret)
goto fail;
@@ -515,7 +506,7 @@ out:
mutex_unlock(&list_lrus_mutex);
return ret;
fail:
- list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+ list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list)
memcg_cancel_update_list_lru(lru, old_size, new_size);
goto out;
}
@@ -552,9 +543,6 @@ static void memcg_drain_list_lru(struct list_lru *lru,
{
int i;
- if (!list_lru_memcg_aware(lru))
- return;
-
for_each_node(i)
memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
}
@@ -564,7 +552,7 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
struct list_lru *lru;
mutex_lock(&list_lrus_mutex);
- list_for_each_entry(lru, &list_lrus, list)
+ list_for_each_entry(lru, &memcg_list_lrus, list)
memcg_drain_list_lru(lru, src_idx, dst_memcg);
mutex_unlock(&list_lrus_mutex);
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 184dcd2e5d99..1018e50566f3 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -287,7 +287,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
{
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
- end == MEMBLOCK_ALLOC_KASAN)
+ end == MEMBLOCK_ALLOC_NOLEAKTRACE)
end = memblock.current_limit;
/* avoid allocating the first page */
@@ -366,14 +366,14 @@ void __init memblock_discard(void)
addr = __pa(memblock.reserved.regions);
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.reserved.max);
- __memblock_free_late(addr, size);
+ memblock_free_late(addr, size);
}
if (memblock.memory.regions != memblock_memory_init_regions) {
addr = __pa(memblock.memory.regions);
size = PAGE_ALIGN(sizeof(struct memblock_region) *
memblock.memory.max);
- __memblock_free_late(addr, size);
+ memblock_free_late(addr, size);
}
memblock_memory = NULL;
@@ -472,7 +472,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
kfree(old_array);
else if (old_array != memblock_memory_init_regions &&
old_array != memblock_reserved_init_regions)
- memblock_free_ptr(old_array, old_alloc_size);
+ memblock_free(old_array, old_alloc_size);
/*
* Reserve the new array if that comes from the memblock. Otherwise, we
@@ -655,6 +655,7 @@ repeat:
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
+ * @flags: flags of the new region
*
* Add new memblock region [@base, @base + @size) to the "memory"
* type. See memblock_add_range() description for mode details
@@ -663,14 +664,14 @@ repeat:
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
- int nid)
+ int nid, enum memblock_flags flags)
{
phys_addr_t end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__,
- &base, &end, nid, (void *)_RET_IP_);
+ memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
+ &base, &end, nid, flags, (void *)_RET_IP_);
- return memblock_add_range(&memblock.memory, base, size, nid, 0);
+ return memblock_add_range(&memblock.memory, base, size, nid, flags);
}
/**
@@ -796,28 +797,28 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
}
/**
- * memblock_free_ptr - free boot memory allocation
+ * memblock_free - free boot memory allocation
* @ptr: starting address of the boot memory allocation
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_alloc_xx() API.
* The freeing memory will not be released to the buddy allocator.
*/
-void __init_memblock memblock_free_ptr(void *ptr, size_t size)
+void __init_memblock memblock_free(void *ptr, size_t size)
{
if (ptr)
- memblock_free(__pa(ptr), size);
+ memblock_phys_free(__pa(ptr), size);
}
/**
- * memblock_free - free boot memory block
+ * memblock_phys_free - free boot memory block
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_alloc_xx() API.
* The freeing memory will not be released to the buddy allocator.
*/
-int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
@@ -932,6 +933,9 @@ int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
* covered by the memory map. The struct page representing NOMAP memory
* frames in the memory map will be PageReserved()
*
+ * Note: if the memory being marked %MEMBLOCK_NOMAP was allocated from
+ * memblock, the caller must inform kmemleak to ignore that memory
+ *
* Return: 0 on success, -errno on failure.
*/
int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
@@ -978,6 +982,10 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
return true;
+ /* skip driver-managed memory unless we were asked for it explicitly */
+ if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
+ return true;
+
return false;
}
@@ -1379,8 +1387,11 @@ again:
return 0;
done:
- /* Skip kmemleak for kasan_init() due to high volume. */
- if (end != MEMBLOCK_ALLOC_KASAN)
+ /*
+ * Skip kmemleak for those places like kasan_init() and
+ * early_pgtable_alloc() due to high volume.
+ */
+ if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
/*
* The min_count is set to 0 so that memblock allocated
* blocks are never reported as leaks. This is because many
@@ -1586,7 +1597,7 @@ void * __init memblock_alloc_try_nid(
}
/**
- * __memblock_free_late - free pages directly to buddy allocator
+ * memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
@@ -1594,7 +1605,7 @@ void * __init memblock_alloc_try_nid(
* down, but we are still initializing the system. Pages are released directly
* to the buddy allocator.
*/
-void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
+void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
{
phys_addr_t cursor, end;
@@ -1687,7 +1698,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
if (!size)
return;
- if (memblock.memory.cnt <= 1) {
+ if (!memblock_memory->total_size) {
pr_warn("%s: No memory registered yet\n", __func__);
return;
}
@@ -1934,7 +1945,7 @@ static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn)
* memmap array.
*/
if (pg < pgend)
- memblock_free(pg, pgend - pg);
+ memblock_phys_free(pg, pgend - pg);
}
/*
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6da5020a8656..508bcea7df56 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -103,11 +103,6 @@ static bool do_memsw_account(void)
return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
-/* memcg and lruvec stats flushing */
-static void flush_memcg_stats_dwork(struct work_struct *w);
-static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
-static DEFINE_SPINLOCK(stats_flush_lock);
-
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
@@ -239,7 +234,7 @@ enum res_type {
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
-static inline bool should_force_charge(void)
+static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
(current->flags & PF_EXITING);
@@ -456,28 +451,6 @@ ino_t page_cgroup_ino(struct page *page)
return ino;
}
-static struct mem_cgroup_per_node *
-mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
-{
- int nid = page_to_nid(page);
-
- return memcg->nodeinfo[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_node(int nid)
-{
- return soft_limit_tree.rb_tree_per_node[nid];
-}
-
-static struct mem_cgroup_tree_per_node *
-soft_limit_tree_from_page(struct page *page)
-{
- int nid = page_to_nid(page);
-
- return soft_limit_tree.rb_tree_per_node[nid];
-}
-
static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
struct mem_cgroup_tree_per_node *mctz,
unsigned long new_usage_in_excess)
@@ -548,13 +521,13 @@ static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
return excess;
}
-static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
+static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
{
unsigned long excess;
struct mem_cgroup_per_node *mz;
struct mem_cgroup_tree_per_node *mctz;
- mctz = soft_limit_tree_from_page(page);
+ mctz = soft_limit_tree.rb_tree_per_node[nid];
if (!mctz)
return;
/*
@@ -562,7 +535,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
* because their event counter is not touched.
*/
for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- mz = mem_cgroup_page_nodeinfo(memcg, page);
+ mz = memcg->nodeinfo[nid];
excess = soft_limit_excess(memcg);
/*
* We have to update the tree if mz is on RB-tree or
@@ -593,7 +566,7 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
for_each_node(nid) {
mz = memcg->nodeinfo[nid];
- mctz = soft_limit_tree_node(nid);
+ mctz = soft_limit_tree.rb_tree_per_node[nid];
if (mctz)
mem_cgroup_remove_exceeded(mz, mctz);
}
@@ -635,6 +608,58 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
return mz;
}
+/*
+ * memcg and lruvec stats flushing
+ *
+ * Many codepaths leading to stats update or read are performance sensitive and
+ * adding stats flushing in such codepaths is not desirable. So, to optimize the
+ * flushing the kernel does:
+ *
+ * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
+ * rstat update tree grow unbounded.
+ *
+ * 2) Flush the stats synchronously on reader side only when there are more than
+ * (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
+ * will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
+ * only for 2 seconds due to (1).
+ */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static DEFINE_SPINLOCK(stats_flush_lock);
+static DEFINE_PER_CPU(unsigned int, stats_updates);
+static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+
+static inline void memcg_rstat_updated(struct mem_cgroup *memcg)
+{
+ cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+ if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH))
+ atomic_inc(&stats_flush_threshold);
+}
+
+static void __mem_cgroup_flush_stats(void)
+{
+ unsigned long flag;
+
+ if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+ return;
+
+ cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+ atomic_set(&stats_flush_threshold, 0);
+ spin_unlock_irqrestore(&stats_flush_lock, flag);
+}
+
+void mem_cgroup_flush_stats(void)
+{
+ if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+ __mem_cgroup_flush_stats();
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+ mem_cgroup_flush_stats();
+ queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+}
+
/**
* __mod_memcg_state - update cgroup memory statistics
* @memcg: the memory cgroup
@@ -647,7 +672,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
return;
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
- cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+ memcg_rstat_updated(memcg);
}
/* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -675,10 +700,12 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
memcg = pn->memcg;
/* Update memcg */
- __mod_memcg_state(memcg, idx, val);
+ __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
/* Update lruvec */
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+
+ memcg_rstat_updated(memcg);
}
/**
@@ -780,7 +807,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
return;
__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
- cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+ memcg_rstat_updated(memcg);
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -799,7 +826,6 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
- struct page *page,
int nr_pages)
{
/* pagein of a big page is an event. So, ignore page size */
@@ -842,7 +868,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
* Check events in order.
*
*/
-static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
+static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
@@ -853,7 +879,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
MEM_CGROUP_TARGET_SOFTLIMIT);
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
- mem_cgroup_update_tree(memcg, page);
+ mem_cgroup_update_tree(memcg, nid);
}
}
@@ -1149,64 +1175,88 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
}
#ifdef CONFIG_DEBUG_VM
-void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
struct mem_cgroup *memcg;
if (mem_cgroup_disabled())
return;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
if (!memcg)
- VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+ VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
else
- VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
+ VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
#endif
/**
- * lock_page_lruvec - lock and return lruvec for a given page.
- * @page: the page
+ * folio_lruvec_lock - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
*
* These functions are safe to use under any of the following conditions:
- * - page locked
- * - PageLRU cleared
- * - lock_page_memcg()
- * - page->_refcount is zero
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held.
*/
-struct lruvec *lock_page_lruvec(struct page *page)
+struct lruvec *folio_lruvec_lock(struct folio *folio)
{
- struct lruvec *lruvec;
+ struct lruvec *lruvec = folio_lruvec(folio);
- lruvec = mem_cgroup_page_lruvec(page);
spin_lock(&lruvec->lru_lock);
-
- lruvec_memcg_debug(lruvec, page);
+ lruvec_memcg_debug(lruvec, folio);
return lruvec;
}
-struct lruvec *lock_page_lruvec_irq(struct page *page)
+/**
+ * folio_lruvec_lock_irq - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
{
- struct lruvec *lruvec;
+ struct lruvec *lruvec = folio_lruvec(folio);
- lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irq(&lruvec->lru_lock);
-
- lruvec_memcg_debug(lruvec, page);
+ lruvec_memcg_debug(lruvec, folio);
return lruvec;
}
-struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+/**
+ * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
+ * @folio: Pointer to the folio.
+ * @flags: Pointer to irqsave flags.
+ *
+ * These functions are safe to use under any of the following conditions:
+ * - folio locked
+ * - folio_test_lru false
+ * - folio_memcg_lock()
+ * - folio frozen (refcount of 0)
+ *
+ * Return: The lruvec this folio is on with its lock held and interrupts
+ * disabled.
+ */
+struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
+ unsigned long *flags)
{
- struct lruvec *lruvec;
+ struct lruvec *lruvec = folio_lruvec(folio);
- lruvec = mem_cgroup_page_lruvec(page);
spin_lock_irqsave(&lruvec->lru_lock, *flags);
-
- lruvec_memcg_debug(lruvec, page);
+ lruvec_memcg_debug(lruvec, folio);
return lruvec;
}
@@ -1414,7 +1464,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
*
* Current memory state:
*/
- cgroup_rstat_flush(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
u64 size;
@@ -1575,7 +1625,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
- ret = should_force_charge() || out_of_memory(&oc);
+ ret = task_is_dying() || out_of_memory(&oc);
unlock:
mutex_unlock(&oom_lock);
@@ -1956,18 +2006,17 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
/**
- * lock_page_memcg - lock a page and memcg binding
- * @page: the page
+ * folio_memcg_lock - Bind a folio to its memcg.
+ * @folio: The folio.
*
- * This function protects unlocked LRU pages from being moved to
+ * This function prevents unlocked LRU folios from being moved to
* another cgroup.
*
- * It ensures lifetime of the locked memcg. Caller is responsible
- * for the lifetime of the page.
+ * It ensures lifetime of the bound memcg. The caller is responsible
+ * for the lifetime of the folio.
*/
-void lock_page_memcg(struct page *page)
+void folio_memcg_lock(struct folio *folio)
{
- struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1981,7 +2030,7 @@ void lock_page_memcg(struct page *page)
if (mem_cgroup_disabled())
return;
again:
- memcg = page_memcg(head);
+ memcg = folio_memcg(folio);
if (unlikely(!memcg))
return;
@@ -1995,7 +2044,7 @@ again:
return;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != page_memcg(head)) {
+ if (memcg != folio_memcg(folio)) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@@ -2009,9 +2058,15 @@ again:
memcg->move_lock_task = current;
memcg->move_lock_flags = flags;
}
+EXPORT_SYMBOL(folio_memcg_lock);
+
+void lock_page_memcg(struct page *page)
+{
+ folio_memcg_lock(page_folio(page));
+}
EXPORT_SYMBOL(lock_page_memcg);
-static void __unlock_page_memcg(struct mem_cgroup *memcg)
+static void __folio_memcg_unlock(struct mem_cgroup *memcg)
{
if (memcg && memcg->move_lock_task == current) {
unsigned long flags = memcg->move_lock_flags;
@@ -2026,14 +2081,22 @@ static void __unlock_page_memcg(struct mem_cgroup *memcg)
}
/**
- * unlock_page_memcg - unlock a page and memcg binding
- * @page: the page
+ * folio_memcg_unlock - Release the binding between a folio and its memcg.
+ * @folio: The folio.
+ *
+ * This releases the binding created by folio_memcg_lock(). This does
+ * not change the accounting of this folio to its memcg, but it does
+ * permit others to change it.
*/
-void unlock_page_memcg(struct page *page)
+void folio_memcg_unlock(struct folio *folio)
{
- struct page *head = compound_head(page);
+ __folio_memcg_unlock(folio_memcg(folio));
+}
+EXPORT_SYMBOL(folio_memcg_unlock);
- __unlock_page_memcg(page_memcg(head));
+void unlock_page_memcg(struct page *page)
+{
+ folio_memcg_unlock(page_folio(page));
}
EXPORT_SYMBOL(unlock_page_memcg);
@@ -2530,6 +2593,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct page_counter *counter;
enum oom_status oom_status;
unsigned long nr_reclaimed;
+ bool passed_oom = false;
bool may_swap = true;
bool drained = false;
unsigned long pflags;
@@ -2565,15 +2629,6 @@ retry:
goto force;
/*
- * Unlike in global OOM situations, memcg is not in a physical
- * memory shortage. Allow dying and OOM-killed tasks to
- * bypass the last charges so that they can exit quickly and
- * free their memory.
- */
- if (unlikely(should_force_charge()))
- goto force;
-
- /*
* Prevent unbounded recursion when reclaim operations need to
* allocate memory. This might exceed the limits temporarily,
* but we prefer facilitating memory reclaim and getting back
@@ -2630,8 +2685,9 @@ retry:
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
- if (fatal_signal_pending(current))
- goto force;
+ /* Avoid endless loop for tasks bypassed by the oom killer */
+ if (passed_oom && task_is_dying())
+ goto nomem;
/*
* keep retrying as long as the memcg oom killer is able to make
@@ -2640,14 +2696,10 @@ retry:
*/
oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
get_order(nr_pages * PAGE_SIZE));
- switch (oom_status) {
- case OOM_SUCCESS:
+ if (oom_status == OOM_SUCCESS) {
+ passed_oom = true;
nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
- case OOM_FAILED:
- goto force;
- default:
- goto nomem;
}
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
@@ -2722,8 +2774,7 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return try_charge_memcg(memcg, gfp_mask, nr_pages);
}
-#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
-static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
return;
@@ -2732,11 +2783,10 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
}
-#endif
-static void commit_charge(struct page *page, struct mem_cgroup *memcg)
+static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
- VM_BUG_ON_PAGE(page_memcg(page), page);
+ VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
/*
* Any of the following ensures page's memcg stability:
*
@@ -2745,7 +2795,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
* - lock_page_memcg()
* - exclusive reference
*/
- page->memcg_data = (unsigned long)memcg;
+ folio->memcg_data = (unsigned long)memcg;
}
static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
@@ -2951,7 +3001,6 @@ static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
unsigned int nr_pages)
{
- struct page_counter *counter;
struct mem_cgroup *memcg;
int ret;
@@ -2961,21 +3010,8 @@ static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
if (ret)
goto out;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
- !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
-
- /*
- * Enforce __GFP_NOFAIL allocation because callers are not
- * prepared to see failures and likely do not have any failure
- * handling code.
- */
- if (gfp & __GFP_NOFAIL) {
- page_counter_charge(&memcg->kmem, nr_pages);
- goto out;
- }
- cancel_charge(memcg, nr_pages);
- ret = -ENOMEM;
- }
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_charge(&memcg->kmem, nr_pages);
out:
css_put(&memcg->css);
@@ -3015,15 +3051,16 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
+ struct folio *folio = page_folio(page);
struct obj_cgroup *objcg;
unsigned int nr_pages = 1 << order;
- if (!PageMemcgKmem(page))
+ if (!folio_memcg_kmem(folio))
return;
- objcg = __page_objcg(page);
+ objcg = __folio_objcg(folio);
obj_cgroup_uncharge_pages(objcg, nr_pages);
- page->memcg_data = 0;
+ folio->memcg_data = 0;
obj_cgroup_put(objcg);
}
@@ -3257,17 +3294,18 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
*/
void split_page_memcg(struct page *head, unsigned int nr)
{
- struct mem_cgroup *memcg = page_memcg(head);
+ struct folio *folio = page_folio(head);
+ struct mem_cgroup *memcg = folio_memcg(folio);
int i;
if (mem_cgroup_disabled() || !memcg)
return;
for (i = 1; i < nr; i++)
- head[i].memcg_data = head->memcg_data;
+ folio_page(folio, i)->memcg_data = folio->memcg_data;
- if (PageMemcgKmem(head))
- obj_cgroup_get_many(__page_objcg(head), nr - 1);
+ if (folio_memcg_kmem(folio))
+ obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
else
css_get_many(&memcg->css, nr - 1);
}
@@ -3381,7 +3419,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
if (order > 0)
return 0;
- mctz = soft_limit_tree_node(pgdat->node_id);
+ mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];
/*
* Do not even bother to check the largest node if the root
@@ -3465,19 +3503,11 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
/* try to free all pages in this cgroup */
while (nr_retries && page_counter_read(&memcg->memory)) {
- int progress;
-
if (signal_pending(current))
return -EINTR;
- progress = try_to_free_mem_cgroup_pages(memcg, 1,
- GFP_KERNEL, true);
- if (!progress) {
+ if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
nr_retries--;
- /* maybe some writeback is necessary */
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- }
-
}
return 0;
@@ -3518,8 +3548,7 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- /* mem_cgroup_threshold() calls here from irqsafe context */
- cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
val = memcg_page_state(memcg, NR_FILE_PAGES) +
memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
@@ -3594,7 +3623,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
return 0;
BUG_ON(memcg->kmemcg_id >= 0);
- BUG_ON(memcg->kmem_state);
memcg_id = memcg_alloc_cache_id();
if (memcg_id < 0)
@@ -3611,22 +3639,18 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
static_branch_enable(&memcg_kmem_enabled_key);
memcg->kmemcg_id = memcg_id;
- memcg->kmem_state = KMEM_ONLINE;
return 0;
}
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
- struct cgroup_subsys_state *css;
- struct mem_cgroup *parent, *child;
+ struct mem_cgroup *parent;
int kmemcg_id;
- if (memcg->kmem_state != KMEM_ONLINE)
+ if (memcg->kmemcg_id == -1)
return;
- memcg->kmem_state = KMEM_ALLOCATED;
-
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
@@ -3637,31 +3661,15 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
BUG_ON(kmemcg_id < 0);
/*
- * Change kmemcg_id of this cgroup and all its descendants to the
- * parent's id, and then move all entries from this cgroup's list_lrus
- * to ones of the parent. After we have finished, all list_lrus
- * corresponding to this cgroup are guaranteed to remain empty. The
- * ordering is imposed by list_lru_node->lock taken by
+ * After we have finished memcg_reparent_objcgs(), all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty.
+ * The ordering is imposed by list_lru_node->lock taken by
* memcg_drain_all_list_lrus().
*/
- rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
- css_for_each_descendant_pre(css, &memcg->css) {
- child = mem_cgroup_from_css(css);
- BUG_ON(child->kmemcg_id != kmemcg_id);
- child->kmemcg_id = parent->kmemcg_id;
- }
- rcu_read_unlock();
-
memcg_drain_all_list_lrus(kmemcg_id, parent);
memcg_free_cache_id(kmemcg_id);
-}
-
-static void memcg_free_kmem(struct mem_cgroup *memcg)
-{
- /* css_alloc() failed, offlining didn't happen */
- if (unlikely(memcg->kmem_state == KMEM_ONLINE))
- memcg_offline_kmem(memcg);
+ memcg->kmemcg_id = -1;
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -3671,22 +3679,8 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}
-static void memcg_free_kmem(struct mem_cgroup *memcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
-static int memcg_update_kmem_max(struct mem_cgroup *memcg,
- unsigned long max)
-{
- int ret;
-
- mutex_lock(&memcg_max_mutex);
- ret = page_counter_set_max(&memcg->kmem, max);
- mutex_unlock(&memcg_max_mutex);
- return ret;
-}
-
static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
{
int ret;
@@ -3752,10 +3746,8 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
- pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
- "Please report your usecase to linux-mm@kvack.org if you "
- "depend on this functionality.\n");
- ret = memcg_update_kmem_max(memcg, nr_pages);
+ /* kmem.limit_in_bytes is deprecated. */
+ ret = -EOPNOTSUPP;
break;
case _TCP:
ret = memcg_update_tcp_max(memcg, nr_pages);
@@ -3900,7 +3892,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
int nid;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- cgroup_rstat_flush(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
seq_printf(m, "%s=%lu", stat->name,
@@ -3972,7 +3964,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
- cgroup_rstat_flush(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
unsigned long nr;
@@ -4475,7 +4467,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
@@ -4537,17 +4529,17 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
* As being wrong occasionally doesn't matter, updates and accesses to the
* records are lockless and racy.
*/
-void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
struct bdi_writeback *wb)
{
- struct mem_cgroup *memcg = page_memcg(page);
+ struct mem_cgroup *memcg = folio_memcg(folio);
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
int oldest = -1;
int i;
- trace_track_foreign_dirty(page, wb);
+ trace_track_foreign_dirty(folio, wb);
/*
* Pick the slot to use. If there is already a slot for @wb, keep
@@ -5308,7 +5300,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
free_shrinker_info(memcg);
- memcg_free_kmem(memcg);
+
+ /* Need to offline kmem if online_css() fails */
+ memcg_offline_kmem(memcg);
mem_cgroup_free(memcg);
}
@@ -5341,21 +5335,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
memcg_wb_domain_size_changed(memcg);
}
-void mem_cgroup_flush_stats(void)
-{
- if (!spin_trylock(&stats_flush_lock))
- return;
-
- cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
- spin_unlock(&stats_flush_lock);
-}
-
-static void flush_memcg_stats_dwork(struct work_struct *w)
-{
- mem_cgroup_flush_stats();
- queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
-}
-
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5545,7 +5524,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
#endif
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
- unsigned long addr, pte_t ptent, swp_entry_t *entry)
+ unsigned long addr, pte_t ptent)
{
if (!vma->vm_file) /* anonymous vma */
return NULL;
@@ -5575,38 +5554,39 @@ static int mem_cgroup_move_account(struct page *page,
struct mem_cgroup *from,
struct mem_cgroup *to)
{
+ struct folio *folio = page_folio(page);
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
- unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
- int ret;
+ unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
+ int nid, ret;
VM_BUG_ON(from == to);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON(compound && !PageTransHuge(page));
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ VM_BUG_ON(compound && !folio_test_multi(folio));
/*
* Prevent mem_cgroup_migrate() from looking at
* page's memory cgroup of its source page while we change it.
*/
ret = -EBUSY;
- if (!trylock_page(page))
+ if (!folio_trylock(folio))
goto out;
ret = -EINVAL;
- if (page_memcg(page) != from)
+ if (folio_memcg(folio) != from)
goto out_unlock;
- pgdat = page_pgdat(page);
+ pgdat = folio_pgdat(folio);
from_vec = mem_cgroup_lruvec(from, pgdat);
to_vec = mem_cgroup_lruvec(to, pgdat);
- lock_page_memcg(page);
+ folio_memcg_lock(folio);
- if (PageAnon(page)) {
- if (page_mapped(page)) {
+ if (folio_test_anon(folio)) {
+ if (folio_mapped(folio)) {
__mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
- if (PageTransHuge(page)) {
+ if (folio_test_transhuge(folio)) {
__mod_lruvec_state(from_vec, NR_ANON_THPS,
-nr_pages);
__mod_lruvec_state(to_vec, NR_ANON_THPS,
@@ -5617,18 +5597,18 @@ static int mem_cgroup_move_account(struct page *page,
__mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
- if (PageSwapBacked(page)) {
+ if (folio_test_swapbacked(folio)) {
__mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
__mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
}
- if (page_mapped(page)) {
+ if (folio_mapped(folio)) {
__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
- if (PageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ if (folio_test_dirty(folio)) {
+ struct address_space *mapping = folio_mapping(folio);
if (mapping_can_writeback(mapping)) {
__mod_lruvec_state(from_vec, NR_FILE_DIRTY,
@@ -5639,7 +5619,7 @@ static int mem_cgroup_move_account(struct page *page,
}
}
- if (PageWriteback(page)) {
+ if (folio_test_writeback(folio)) {
__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
@@ -5662,20 +5642,21 @@ static int mem_cgroup_move_account(struct page *page,
css_get(&to->css);
css_put(&from->css);
- page->memcg_data = (unsigned long)to;
+ folio->memcg_data = (unsigned long)to;
- __unlock_page_memcg(from);
+ __folio_memcg_unlock(from);
ret = 0;
+ nid = folio_nid(folio);
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, nr_pages);
- memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, -nr_pages);
- memcg_check_events(from, page);
+ mem_cgroup_charge_statistics(to, nr_pages);
+ memcg_check_events(to, nid);
+ mem_cgroup_charge_statistics(from, -nr_pages);
+ memcg_check_events(from, nid);
local_irq_enable();
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
out:
return ret;
}
@@ -5718,7 +5699,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, ptent, &ent);
else if (pte_none(ptent))
- page = mc_handle_file_pte(vma, addr, ptent, &ent);
+ page = mc_handle_file_pte(vma, addr, ptent);
if (!page && !ent.val)
return ret;
@@ -6373,7 +6354,7 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
int i;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- cgroup_rstat_flush(memcg->css.cgroup);
+ mem_cgroup_flush_stats();
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
int nid;
@@ -6680,9 +6661,10 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_low_usage)));
}
-static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
+static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
+ gfp_t gfp)
{
- unsigned int nr_pages = thp_nr_pages(page);
+ long nr_pages = folio_nr_pages(folio);
int ret;
ret = try_charge(memcg, gfp, nr_pages);
@@ -6690,38 +6672,23 @@ static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
goto out;
css_get(&memcg->css);
- commit_charge(page, memcg);
+ commit_charge(folio, memcg);
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, nr_pages);
- memcg_check_events(memcg, page);
+ mem_cgroup_charge_statistics(memcg, nr_pages);
+ memcg_check_events(memcg, folio_nid(folio));
local_irq_enable();
out:
return ret;
}
-/**
- * __mem_cgroup_charge - charge a newly allocated page to a cgroup
- * @page: page to charge
- * @mm: mm context of the victim
- * @gfp_mask: reclaim mode
- *
- * Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary. if @mm is NULL, try to
- * charge to the active memcg.
- *
- * Do not use this for pages allocated for swapin.
- *
- * Returns 0 on success. Otherwise, an error code is returned.
- */
-int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask)
+int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
{
struct mem_cgroup *memcg;
int ret;
memcg = get_mem_cgroup_from_mm(mm);
- ret = charge_memcg(page, memcg, gfp_mask);
+ ret = charge_memcg(folio, memcg, gfp);
css_put(&memcg->css);
return ret;
@@ -6742,6 +6709,7 @@ int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry)
{
+ struct folio *folio = page_folio(page);
struct mem_cgroup *memcg;
unsigned short id;
int ret;
@@ -6756,7 +6724,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
memcg = get_mem_cgroup_from_mm(mm);
rcu_read_unlock();
- ret = charge_memcg(page, memcg, gfp);
+ ret = charge_memcg(folio, memcg, gfp);
css_put(&memcg->css);
return ret;
@@ -6800,7 +6768,7 @@ struct uncharge_gather {
unsigned long nr_memory;
unsigned long pgpgout;
unsigned long nr_kmem;
- struct page *dummy_page;
+ int nid;
};
static inline void uncharge_gather_clear(struct uncharge_gather *ug)
@@ -6824,36 +6792,36 @@ static void uncharge_batch(const struct uncharge_gather *ug)
local_irq_save(flags);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
- memcg_check_events(ug->memcg, ug->dummy_page);
+ memcg_check_events(ug->memcg, ug->nid);
local_irq_restore(flags);
- /* drop reference from uncharge_page */
+ /* drop reference from uncharge_folio */
css_put(&ug->memcg->css);
}
-static void uncharge_page(struct page *page, struct uncharge_gather *ug)
+static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
- unsigned long nr_pages;
+ long nr_pages;
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
- bool use_objcg = PageMemcgKmem(page);
+ bool use_objcg = folio_memcg_kmem(folio);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* Nobody should be changing or seriously looking at
- * page memcg or objcg at this point, we have fully
- * exclusive access to the page.
+ * folio memcg or objcg at this point, we have fully
+ * exclusive access to the folio.
*/
if (use_objcg) {
- objcg = __page_objcg(page);
+ objcg = __folio_objcg(folio);
/*
* This get matches the put at the end of the function and
* kmem pages do not hold memcg references anymore.
*/
memcg = get_mem_cgroup_from_objcg(objcg);
} else {
- memcg = __page_memcg(page);
+ memcg = __folio_memcg(folio);
}
if (!memcg)
@@ -6865,19 +6833,19 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
uncharge_gather_clear(ug);
}
ug->memcg = memcg;
- ug->dummy_page = page;
+ ug->nid = folio_nid(folio);
/* pairs with css_put in uncharge_batch */
css_get(&memcg->css);
}
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
if (use_objcg) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
- page->memcg_data = 0;
+ folio->memcg_data = 0;
obj_cgroup_put(objcg);
} else {
/* LRU pages aren't accounted at the root level */
@@ -6885,28 +6853,22 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;
- page->memcg_data = 0;
+ folio->memcg_data = 0;
}
css_put(&memcg->css);
}
-/**
- * __mem_cgroup_uncharge - uncharge a page
- * @page: page to uncharge
- *
- * Uncharge a page previously charged with __mem_cgroup_charge().
- */
-void __mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct folio *folio)
{
struct uncharge_gather ug;
- /* Don't touch page->lru of any random page, pre-check: */
- if (!page_memcg(page))
+ /* Don't touch folio->lru of any random page, pre-check: */
+ if (!folio_memcg(folio))
return;
uncharge_gather_clear(&ug);
- uncharge_page(page, &ug);
+ uncharge_folio(folio, &ug);
uncharge_batch(&ug);
}
@@ -6920,52 +6882,49 @@ void __mem_cgroup_uncharge(struct page *page)
void __mem_cgroup_uncharge_list(struct list_head *page_list)
{
struct uncharge_gather ug;
- struct page *page;
+ struct folio *folio;
uncharge_gather_clear(&ug);
- list_for_each_entry(page, page_list, lru)
- uncharge_page(page, &ug);
+ list_for_each_entry(folio, page_list, lru)
+ uncharge_folio(folio, &ug);
if (ug.memcg)
uncharge_batch(&ug);
}
/**
- * mem_cgroup_migrate - charge a page's replacement
- * @oldpage: currently circulating page
- * @newpage: replacement page
+ * mem_cgroup_migrate - Charge a folio's replacement.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
*
- * Charge @newpage as a replacement page for @oldpage. @oldpage will
+ * Charge @new as a replacement folio for @old. @old will
* be uncharged upon free.
*
- * Both pages must be locked, @newpage->mapping must be set up.
+ * Both folios must be locked, @new->mapping must be set up.
*/
-void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
- unsigned int nr_pages;
+ long nr_pages = folio_nr_pages(new);
unsigned long flags;
- VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
- VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
- VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
- VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
- newpage);
+ VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+ VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+ VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+ VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);
if (mem_cgroup_disabled())
return;
- /* Page cache replacement: new page already charged? */
- if (page_memcg(newpage))
+ /* Page cache replacement: new folio already charged? */
+ if (folio_memcg(new))
return;
- memcg = page_memcg(oldpage);
- VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
+ memcg = folio_memcg(old);
+ VM_WARN_ON_ONCE_FOLIO(!memcg, old);
if (!memcg)
return;
/* Force-charge the new page. The old one will be freed soon */
- nr_pages = thp_nr_pages(newpage);
-
if (!mem_cgroup_is_root(memcg)) {
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
@@ -6973,11 +6932,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
}
css_get(&memcg->css);
- commit_charge(newpage, memcg);
+ commit_charge(new, memcg);
local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
- memcg_check_events(memcg, newpage);
+ mem_cgroup_charge_statistics(memcg, nr_pages);
+ memcg_check_events(memcg, folio_nid(new));
local_irq_restore(flags);
}
@@ -7204,8 +7163,8 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, -nr_entries);
- memcg_check_events(memcg, page);
+ mem_cgroup_charge_statistics(memcg, -nr_entries);
+ memcg_check_events(memcg, page_to_nid(page));
css_put(&memcg->css);
}
diff --git a/mm/memfd.c b/mm/memfd.c
index 081dd33e6a61..9f80f162791a 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -297,9 +297,7 @@ SYSCALL_DEFINE2(memfd_create,
}
if (flags & MFD_HUGETLB) {
- struct ucounts *ucounts = NULL;
-
- file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3e6449f2102a..f64ebb6226cb 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -39,6 +39,7 @@
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
+#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -57,6 +58,7 @@
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
#include "internal.h"
#include "ras/ras_event.h"
@@ -673,7 +675,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
#define hwpoison_hugetlb_range NULL
#endif
-static struct mm_walk_ops hwp_walk_ops = {
+static const struct mm_walk_ops hwp_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
};
@@ -762,7 +764,7 @@ static int delete_from_lru_cache(struct page *p)
* Poisoned page might never drop its ref count to 0 so we have
* to uncharge it manually from its memcg.
*/
- mem_cgroup_uncharge(p);
+ mem_cgroup_uncharge(page_folio(p));
/*
* drop the page count elevated by isolate_lru_page()
@@ -806,12 +808,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
return ret;
}
+struct page_state {
+ unsigned long mask;
+ unsigned long res;
+ enum mf_action_page_type type;
+
+ /* Callback ->action() has to unlock the relevant page inside it. */
+ int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true if page is still referenced by others, otherwise return
+ * false.
+ *
+ * The extra_pins is true when one extra refcount is expected.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+ bool extra_pins)
+{
+ int count = page_count(p) - 1;
+
+ if (extra_pins)
+ count -= 1;
+
+ if (count > 0) {
+ pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ page_to_pfn(p), action_page_types[ps->type], count);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
* could be more sophisticated.
*/
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
{
unlock_page(p);
return MF_IGNORED;
@@ -820,9 +854,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
/*
* Page in unknown state. Do nothing.
*/
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+ pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@@ -830,10 +864,11 @@ static int me_unknown(struct page *p, unsigned long pfn)
/*
* Clean (or cleaned) page cache page.
*/
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
int ret;
struct address_space *mapping;
+ bool extra_pins;
delete_from_lru_cache(p);
@@ -863,13 +898,23 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
}
/*
+ * The shmem page is kept in page cache instead of truncating
+ * so is expected to have an extra refcount after error-handling.
+ */
+ extra_pins = shmem_mapping(mapping);
+
+ /*
* Truncation is a bit tricky. Enable it per file system for now.
*
* Open: to take i_rwsem or not for this? Right now we don't.
*/
- ret = truncate_error_page(p, pfn, mapping);
+ ret = truncate_error_page(p, page_to_pfn(p), mapping);
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
out:
unlock_page(p);
+
return ret;
}
@@ -878,7 +923,7 @@ out:
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
struct address_space *mapping = page_mapping(p);
@@ -922,7 +967,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
mapping_set_error(mapping, -EIO);
}
- return me_pagecache_clean(p, pfn);
+ return me_pagecache_clean(ps, p);
}
/*
@@ -944,9 +989,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* Clean swap cache pages can be directly isolated. A later page fault will
* bring in the known good data from disk.
*/
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{
int ret;
+ bool extra_pins = false;
ClearPageDirty(p);
/* Trigger EIO in shmem: */
@@ -954,10 +1000,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
unlock_page(p);
+
+ if (ret == MF_DELAYED)
+ extra_pins = true;
+
+ if (has_extra_refcount(ps, p, extra_pins))
+ ret = MF_FAILED;
+
return ret;
}
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
int ret;
@@ -965,6 +1018,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
unlock_page(p);
+
+ if (has_extra_refcount(ps, p, false))
+ ret = MF_FAILED;
+
return ret;
}
@@ -974,7 +1031,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
*/
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
{
int res;
struct page *hpage = compound_head(p);
@@ -985,7 +1042,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
- res = truncate_error_page(hpage, pfn, mapping);
+ res = truncate_error_page(hpage, page_to_pfn(p), mapping);
unlock_page(hpage);
} else {
res = MF_FAILED;
@@ -1003,6 +1060,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
}
}
+ if (has_extra_refcount(ps, p, false))
+ res = MF_FAILED;
+
return res;
}
@@ -1028,14 +1088,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
-static struct page_state {
- unsigned long mask;
- unsigned long res;
- enum mf_action_page_type type;
-
- /* Callback ->action() has to unlock the relevant page inside it. */
- int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
@@ -1095,19 +1148,10 @@ static int page_action(struct page_state *ps, struct page *p,
unsigned long pfn)
{
int result;
- int count;
/* page p should be unlocked after returning from ps->action(). */
- result = ps->action(p, pfn);
+ result = ps->action(ps, p);
- count = page_count(p) - 1;
- if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
- count--;
- if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
- pfn, action_page_types[ps->type], count);
- result = MF_FAILED;
- }
action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
@@ -1147,20 +1191,6 @@ static int __get_hwpoison_page(struct page *page)
if (!HWPoisonHandlable(head))
return -EBUSY;
- if (PageTransHuge(head)) {
- /*
- * Non anonymous thp exists only in allocation/free time. We
- * can't handle such a case correctly, so let's give it up.
- * This should be better than triggering BUG_ON when kernel
- * tries to touch the "partially handled" page.
- */
- if (!PageAnon(head)) {
- pr_err("Memory failure: %#lx: non anonymous thp\n",
- page_to_pfn(page));
- return 0;
- }
- }
-
if (get_page_unless_zero(head)) {
if (head == compound_head(page))
return 1;
@@ -1414,14 +1444,11 @@ static int identify_page_state(unsigned long pfn, struct page *p,
static int try_to_split_thp_page(struct page *page, const char *msg)
{
lock_page(page);
- if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ if (unlikely(split_huge_page(page))) {
unsigned long pfn = page_to_pfn(page);
unlock_page(page);
- if (!PageAnon(page))
- pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
- else
- pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+ pr_info("%s: %#lx: thp split failed\n", msg, pfn);
put_page(page);
return -EBUSY;
}
@@ -1708,6 +1735,20 @@ try_again:
}
if (PageTransHuge(hpage)) {
+ /*
+ * The flag must be set after the refcount is bumped
+ * otherwise it may race with THP split.
+ * And the flag can't be set in get_hwpoison_page() since
+ * it is called by soft offline too and it is just called
+ * for !MF_COUNT_INCREASE. So here seems to be the best
+ * place.
+ *
+ * Don't need care about the above error handling paths for
+ * get_hwpoison_page() since they handle either free page
+ * or unhandlable page. The refcount is bumped iff the
+ * page is a valid handlable page.
+ */
+ SetPageHasHWPoisoned(hpage);
if (try_to_split_thp_page(p, "Memory Failure") < 0) {
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
res = -EBUSY;
@@ -2109,14 +2150,14 @@ static int __soft_offline_page(struct page *page)
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
- pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
- pfn, msg_page[huge], ret, page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n",
+ pfn, msg_page[huge], ret, &page->flags);
if (ret > 0)
ret = -EBUSY;
}
} else {
- pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
- pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
+ pfn, msg_page[huge], page_count(page), &page->flags);
ret = -EBUSY;
}
return ret;
diff --git a/mm/memory.c b/mm/memory.c
index adf9b9ef8277..8f1de811a1dc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -433,35 +433,39 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
- spinlock_t *ptl;
- pgtable_t new = pte_alloc_one(mm);
- if (!new)
- return -ENOMEM;
+ spinlock_t *ptl = pmd_lock(mm, pmd);
- /*
- * Ensure all pte setup (eg. pte page lock and page clearing) are
- * visible before the pte is made visible to other CPUs by being
- * put into page tables.
- *
- * The other side of the story is the pointer chasing in the page
- * table walking code (when walking the page table without locking;
- * ie. most of the time). Fortunately, these data accesses consist
- * of a chain of data-dependent loads, meaning most CPUs (alpha
- * being the notable exception) will already guarantee loads are
- * seen in-order. See the alpha page table accessors for the
- * smp_rmb() barriers in page table walking code.
- */
- smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
-
- ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm_inc_nr_ptes(mm);
- pmd_populate(mm, pmd, new);
- new = NULL;
+ /*
+ * Ensure all pte setup (eg. pte page lock and page clearing) are
+ * visible before the pte is made visible to other CPUs by being
+ * put into page tables.
+ *
+ * The other side of the story is the pointer chasing in the page
+ * table walking code (when walking the page table without locking;
+ * ie. most of the time). Fortunately, these data accesses consist
+ * of a chain of data-dependent loads, meaning most CPUs (alpha
+ * being the notable exception) will already guarantee loads are
+ * seen in-order. See the alpha page table accessors for the
+ * smp_rmb() barriers in page table walking code.
+ */
+ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
+ pmd_populate(mm, pmd, *pte);
+ *pte = NULL;
}
spin_unlock(ptl);
+}
+
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+{
+ pgtable_t new = pte_alloc_one(mm);
+ if (!new)
+ return -ENOMEM;
+
+ pmd_install(mm, pmd, &new);
if (new)
pte_free(mm, new);
return 0;
@@ -473,10 +477,9 @@ int __pte_alloc_kernel(pmd_t *pmd)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ smp_wmb(); /* See comment in pmd_install() */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
}
@@ -990,7 +993,7 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
if (!new_page)
return NULL;
- if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+ if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
put_page(new_page);
return NULL;
}
@@ -1333,16 +1336,8 @@ again:
struct page *page;
page = vm_normal_page(vma, addr, ptent);
- if (unlikely(details) && page) {
- /*
- * unmap_shared_mapping_pages() wants to
- * invalidate cache without truncating:
- * unmap shared but keep private pages.
- */
- if (details->check_mapping &&
- details->check_mapping != page_rmapping(page))
- continue;
- }
+ if (unlikely(zap_skip_check_mapping(details, page)))
+ continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
@@ -1375,17 +1370,8 @@ again:
is_device_exclusive_entry(entry)) {
struct page *page = pfn_swap_entry_to_page(entry);
- if (unlikely(details && details->check_mapping)) {
- /*
- * unmap_shared_mapping_pages() wants to
- * invalidate cache without truncating:
- * unmap shared but keep private pages.
- */
- if (details->check_mapping !=
- page_rmapping(page))
- continue;
- }
-
+ if (unlikely(zap_skip_check_mapping(details, page)))
+ continue;
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
@@ -2724,19 +2710,19 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
* proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
*/
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
- pte_t *page_table, pte_t orig_pte)
+static inline int pte_unmap_same(struct vm_fault *vmf)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
- spinlock_t *ptl = pte_lockptr(mm, pmd);
+ spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(ptl);
- same = pte_same(*page_table, orig_pte);
+ same = pte_same(*vmf->pte, vmf->orig_pte);
spin_unlock(ptl);
}
#endif
- pte_unmap(page_table);
+ pte_unmap(vmf->pte);
+ vmf->pte = NULL;
return same;
}
@@ -3019,7 +3005,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
}
- if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+ if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
goto oom_free_new;
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
@@ -3321,20 +3307,20 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
}
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
+ pgoff_t first_index,
+ pgoff_t last_index,
struct zap_details *details)
{
struct vm_area_struct *vma;
pgoff_t vba, vea, zba, zea;
- vma_interval_tree_foreach(vma, root,
- details->first_index, details->last_index) {
-
+ vma_interval_tree_foreach(vma, root, first_index, last_index) {
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
- zba = details->first_index;
+ zba = first_index;
if (zba < vba)
zba = vba;
- zea = details->last_index;
+ zea = last_index;
if (zea > vea)
zea = vea;
@@ -3360,18 +3346,22 @@ void unmap_mapping_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct zap_details details = { };
+ pgoff_t first_index;
+ pgoff_t last_index;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageTail(page));
- details.check_mapping = mapping;
- details.first_index = page->index;
- details.last_index = page->index + thp_nr_pages(page) - 1;
+ first_index = page->index;
+ last_index = page->index + thp_nr_pages(page) - 1;
+
+ details.zap_mapping = mapping;
details.single_page = page;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
- unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+ last_index, &details);
i_mmap_unlock_write(mapping);
}
@@ -3391,16 +3381,17 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t nr, bool even_cows)
{
struct zap_details details = { };
+ pgoff_t first_index = start;
+ pgoff_t last_index = start + nr - 1;
- details.check_mapping = even_cows ? NULL : mapping;
- details.first_index = start;
- details.last_index = start + nr - 1;
- if (details.last_index < details.first_index)
- details.last_index = ULONG_MAX;
+ details.zap_mapping = even_cows ? NULL : mapping;
+ if (last_index < first_index)
+ last_index = ULONG_MAX;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
- unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ unmap_mapping_range_tree(&mapping->i_mmap, first_index,
+ last_index, &details);
i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
@@ -3488,7 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vm_fault_t ret = 0;
void *shadow = NULL;
- if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
+ if (!pte_unmap_same(vmf))
goto out;
entry = pte_to_swp_entry(vmf->orig_pte);
@@ -3539,7 +3530,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
- workingset_refault(page, shadow);
+ workingset_refault(page_folio(page),
+ shadow);
lru_cache_add(page);
@@ -3769,7 +3761,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
- if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
cgroup_throttle_swaprate(page, GFP_KERNEL);
@@ -3852,7 +3844,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
- smp_wmb(); /* See comment in __pte_alloc() */
}
ret = vma->vm_ops->fault(vmf);
@@ -3907,6 +3898,15 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
return ret;
/*
+ * Just backoff if any subpage of a THP is corrupted otherwise
+ * the corrupted page may mapped by PMD silently to escape the
+ * check. This kind of THP just can be PTE mapped. Access to
+ * the corrupted subpage should trigger SIGBUS as expected.
+ */
+ if (unlikely(PageHasHWPoisoned(page)))
+ return ret;
+
+ /*
* Archs like ppc64 need additional space to store information
* related to pte entry. Use the preallocated table for that.
*/
@@ -3914,7 +3914,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
- smp_wmb(); /* See comment in __pte_alloc() */
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@@ -4027,17 +4026,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return ret;
}
- if (vmf->prealloc_pte) {
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (likely(pmd_none(*vmf->pmd))) {
- mm_inc_nr_ptes(vma->vm_mm);
- pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
- vmf->prealloc_pte = NULL;
- }
- spin_unlock(vmf->ptl);
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
+ if (vmf->prealloc_pte)
+ pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
+ else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
return VM_FAULT_OOM;
- }
}
/* See comment in handle_pte_fault() */
@@ -4146,7 +4138,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
- smp_wmb(); /* See comment in __pte_alloc() */
}
return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
@@ -4193,7 +4184,8 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
+ if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
+ GFP_KERNEL)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
@@ -4258,7 +4250,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults).
* The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * return value. See filemap_fault() and __folio_lock_or_retry().
* If mmap_lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
@@ -4499,7 +4491,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
* concurrent faults).
*
* The mmap_lock may have been released depending on flags and our return value.
- * See filemap_fault() and __lock_page_or_retry().
+ * See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
@@ -4603,7 +4595,7 @@ unlock:
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * return value. See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
@@ -4759,7 +4751,7 @@ static inline void mm_account_fault(struct pt_regs *regs,
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
+ * return value. See filemap_fault() and __folio_lock_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
@@ -4820,13 +4812,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&mm->page_table_lock);
- if (pgd_present(*pgd)) /* Another has populated it */
+ if (pgd_present(*pgd)) { /* Another has populated it */
p4d_free(mm, new);
- else
+ } else {
+ smp_wmb(); /* See comment in pmd_install() */
pgd_populate(mm, pgd, new);
+ }
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -4843,11 +4835,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
spin_lock(&mm->page_table_lock);
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
+ smp_wmb(); /* See comment in pmd_install() */
p4d_populate(mm, p4d, new);
} else /* Another has populated it */
pud_free(mm, new);
@@ -4868,14 +4859,14 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
if (!new)
return -ENOMEM;
- smp_wmb(); /* See comment in __pte_alloc */
-
ptl = pud_lock(mm, pud);
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
+ smp_wmb(); /* See comment in pmd_install() */
pud_populate(mm, pud, new);
- } else /* Another has populated it */
+ } else { /* Another has populated it */
pmd_free(mm, new);
+ }
spin_unlock(ptl);
return 0;
}
@@ -5256,7 +5247,7 @@ void __might_fault(const char *file, int line)
return;
if (pagefault_disabled())
return;
- __might_sleep(file, line, 0);
+ __might_sleep(file, line);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
@@ -5412,7 +5403,6 @@ long copy_huge_page_from_user(struct page *dst_page,
unsigned int pages_per_huge_page,
bool allow_pagefault)
{
- void *src = (void *)usr_src;
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
@@ -5425,8 +5415,7 @@ long copy_huge_page_from_user(struct page *dst_page,
else
page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
- (const void __user *)(src + i * PAGE_SIZE),
- PAGE_SIZE);
+ usr_src + i * PAGE_SIZE, PAGE_SIZE);
if (allow_pagefault)
kunmap(subpage);
else
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9fd0be32a281..852041f6be41 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -21,7 +21,6 @@
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
-#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
@@ -57,7 +56,7 @@ enum {
ONLINE_POLICY_AUTO_MOVABLE,
};
-const char *online_policy_to_str[] = {
+static const char * const online_policy_to_str[] = {
[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};
@@ -220,7 +219,6 @@ static void release_memory_resource(struct resource *res)
kfree(res);
}
-#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
const char *reason)
{
@@ -586,10 +584,6 @@ void generic_online_page(struct page *page, unsigned int order)
debug_pagealloc_map_pages(page, 1 << order);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
-#ifdef CONFIG_HIGHMEM
- if (PageHighMem(page))
- totalhigh_pages_add(1UL << order);
-#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);
@@ -626,16 +620,11 @@ static void node_states_check_changes_online(unsigned long nr_pages,
arg->status_change_nid = NUMA_NO_NODE;
arg->status_change_nid_normal = NUMA_NO_NODE;
- arg->status_change_nid_high = NUMA_NO_NODE;
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
-#ifdef CONFIG_HIGHMEM
- if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
- arg->status_change_nid_high = nid;
-#endif
}
static void node_states_set_node(int node, struct memory_notify *arg)
@@ -643,9 +632,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_normal >= 0)
node_set_state(node, N_NORMAL_MEMORY);
- if (arg->status_change_nid_high >= 0)
- node_set_state(node, N_HIGH_MEMORY);
-
if (arg->status_change_nid >= 0)
node_set_state(node, N_MEMORY);
}
@@ -1163,7 +1149,6 @@ failed_addition:
mem_hotplug_done();
return ret;
}
-#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
static void reset_node_present_pages(pg_data_t *pgdat)
{
@@ -1357,6 +1342,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
+ enum memblock_flags memblock_flags = MEMBLOCK_NONE;
struct vmem_altmap mhp_altmap = {};
struct memory_group *group = NULL;
u64 start, size;
@@ -1384,8 +1370,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
mem_hotplug_begin();
- if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
- memblock_add_node(start, size, nid);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+ if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
+ memblock_flags = MEMBLOCK_DRIVER_MANAGED;
+ ret = memblock_add_node(start, size, nid, memblock_flags);
+ if (ret)
+ goto error_mem_hotplug_end;
+ }
ret = __try_online_node(nid, false);
if (ret < 0)
@@ -1458,6 +1449,7 @@ error:
rollback_node_hotadd(nid);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
+error_mem_hotplug_end:
mem_hotplug_done();
return ret;
}
@@ -1803,7 +1795,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
arg->status_change_nid = NUMA_NO_NODE;
arg->status_change_nid_normal = NUMA_NO_NODE;
- arg->status_change_nid_high = NUMA_NO_NODE;
/*
* Check whether node_states[N_NORMAL_MEMORY] will be changed.
@@ -1818,24 +1809,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
arg->status_change_nid_normal = zone_to_nid(zone);
-#ifdef CONFIG_HIGHMEM
/*
- * node_states[N_HIGH_MEMORY] contains nodes which
- * have normal memory or high memory.
- * Here we add the present_pages belonging to ZONE_HIGHMEM.
- * If the zone is within the range of [0..ZONE_HIGHMEM), and
- * we determine that the zones in that range become empty,
- * we need to clear the node for N_HIGH_MEMORY.
- */
- present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
- if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
- arg->status_change_nid_high = zone_to_nid(zone);
-#endif
-
- /*
- * We have accounted the pages from [0..ZONE_NORMAL), and
- * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
- * as well.
+ * We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
+ * does not apply as we don't support 32bit.
* Here we count the possible pages from ZONE_MOVABLE.
* If after having accounted all the pages, we see that the nr_pages
* to be offlined is over or equal to the accounted pages,
@@ -1853,9 +1829,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_normal >= 0)
node_clear_state(node, N_NORMAL_MEMORY);
- if (arg->status_change_nid_high >= 0)
- node_clear_state(node, N_HIGH_MEMORY);
-
if (arg->status_change_nid >= 0)
node_clear_state(node, N_MEMORY);
}
@@ -2204,7 +2177,7 @@ static int __ref try_remove_memory(u64 start, u64 size)
arch_remove_memory(start, size, altmap);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
- memblock_free(start, size);
+ memblock_phys_free(start, size);
memblock_remove(start, size);
}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1592b081c58e..10e9c87260ed 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -856,16 +856,6 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
goto out;
}
- if (flags & MPOL_F_NUMA_BALANCING) {
- if (new && new->mode == MPOL_BIND) {
- new->flags |= (MPOL_F_MOF | MPOL_F_MORON);
- } else {
- ret = -EINVAL;
- mpol_put(new);
- goto out;
- }
- }
-
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
mpol_put(new);
@@ -1458,7 +1448,11 @@ static inline int sanitize_mpol_flags(int *mode, unsigned short *flags)
return -EINVAL;
if ((*flags & MPOL_F_STATIC_NODES) && (*flags & MPOL_F_RELATIVE_NODES))
return -EINVAL;
-
+ if (*flags & MPOL_F_NUMA_BALANCING) {
+ if (*mode != MPOL_BIND)
+ return -EINVAL;
+ *flags |= (MPOL_F_MOF | MPOL_F_MORON);
+ }
return 0;
}
@@ -2202,6 +2196,98 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
}
EXPORT_SYMBOL(alloc_pages);
+struct folio *folio_alloc(gfp_t gfp, unsigned order)
+{
+ struct page *page = alloc_pages(gfp | __GFP_COMP, order);
+
+ if (page && order > 1)
+ prep_transhuge_page(page);
+ return (struct folio *)page;
+}
+EXPORT_SYMBOL(folio_alloc);
+
+static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ int nodes;
+ unsigned long nr_pages_per_node;
+ int delta;
+ int i;
+ unsigned long nr_allocated;
+ unsigned long total_allocated = 0;
+
+ nodes = nodes_weight(pol->nodes);
+ nr_pages_per_node = nr_pages / nodes;
+ delta = nr_pages - nodes * nr_pages_per_node;
+
+ for (i = 0; i < nodes; i++) {
+ if (delta) {
+ nr_allocated = __alloc_pages_bulk(gfp,
+ interleave_nodes(pol), NULL,
+ nr_pages_per_node + 1, NULL,
+ page_array);
+ delta--;
+ } else {
+ nr_allocated = __alloc_pages_bulk(gfp,
+ interleave_nodes(pol), NULL,
+ nr_pages_per_node, NULL, page_array);
+ }
+
+ page_array += nr_allocated;
+ total_allocated += nr_allocated;
+ }
+
+ return total_allocated;
+}
+
+static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
+ struct mempolicy *pol, unsigned long nr_pages,
+ struct page **page_array)
+{
+ gfp_t preferred_gfp;
+ unsigned long nr_allocated = 0;
+
+ preferred_gfp = gfp | __GFP_NOWARN;
+ preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
+
+ nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
+ nr_pages, NULL, page_array);
+
+ if (nr_allocated < nr_pages)
+ nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
+ nr_pages - nr_allocated, NULL,
+ page_array + nr_allocated);
+ return nr_allocated;
+}
+
+/* alloc pages bulk and mempolicy should be considered at the
+ * same time in some situation such as vmalloc.
+ *
+ * It can accelerate memory allocation especially interleaving
+ * allocate memory.
+ */
+unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
+ unsigned long nr_pages, struct page **page_array)
+{
+ struct mempolicy *pol = &default_policy;
+
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
+
+ if (pol->mode == MPOL_INTERLEAVE)
+ return alloc_pages_bulk_array_interleave(gfp, pol,
+ nr_pages, page_array);
+
+ if (pol->mode == MPOL_PREFERRED_MANY)
+ return alloc_pages_bulk_array_preferred_many(gfp,
+ numa_node_id(), pol, nr_pages, page_array);
+
+ return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
+ policy_nodemask(gfp, pol), nr_pages, NULL,
+ page_array);
+}
+
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
struct mempolicy *pol = mpol_dup(vma_policy(src));
@@ -2981,64 +3067,3 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
-
-bool numa_demotion_enabled = false;
-
-#ifdef CONFIG_SYSFS
-static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return sysfs_emit(buf, "%s\n",
- numa_demotion_enabled? "true" : "false");
-}
-
-static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
- numa_demotion_enabled = true;
- else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
- numa_demotion_enabled = false;
- else
- return -EINVAL;
-
- return count;
-}
-
-static struct kobj_attribute numa_demotion_enabled_attr =
- __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
- numa_demotion_enabled_store);
-
-static struct attribute *numa_attrs[] = {
- &numa_demotion_enabled_attr.attr,
- NULL,
-};
-
-static const struct attribute_group numa_attr_group = {
- .attrs = numa_attrs,
-};
-
-static int __init numa_init_sysfs(void)
-{
- int err;
- struct kobject *numa_kobj;
-
- numa_kobj = kobject_create_and_add("numa", mm_kobj);
- if (!numa_kobj) {
- pr_err("failed to create numa kobject\n");
- return -ENOMEM;
- }
- err = sysfs_create_group(numa_kobj, &numa_attr_group);
- if (err) {
- pr_err("failed to register numa group\n");
- goto delete_obj;
- }
- return 0;
-
-delete_obj:
- kobject_put(numa_kobj);
- return err;
-}
-subsys_initcall(numa_init_sysfs);
-#endif
diff --git a/mm/mempool.c b/mm/mempool.c
index 0b8afbec3e35..b933d0fc21b8 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -17,7 +17,6 @@
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
-#include <linux/blkdev.h>
#include <linux/writeback.h>
#include "slab.h"
diff --git a/mm/memremap.c b/mm/memremap.c
index ed593bf87109..5a66a71ab591 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -505,7 +505,7 @@ void free_devmap_managed_page(struct page *page)
__ClearPageWaiters(page);
- mem_cgroup_uncharge(page);
+ mem_cgroup_uncharge(page_folio(page));
/*
* When a device_private page is freed, the page->mapping field
diff --git a/mm/migrate.c b/mm/migrate.c
index a6a7743ee98f..a11e948593df 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -364,7 +364,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
*/
expected_count += is_device_private_page(page);
if (mapping)
- expected_count += thp_nr_pages(page) + page_has_private(page);
+ expected_count += compound_nr(page) + page_has_private(page);
return expected_count;
}
@@ -377,74 +377,75 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
* 2 for pages with a mapping
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
-int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page, int extra_count)
+int folio_migrate_mapping(struct address_space *mapping,
+ struct folio *newfolio, struct folio *folio, int extra_count)
{
- XA_STATE(xas, &mapping->i_pages, page_index(page));
+ XA_STATE(xas, &mapping->i_pages, folio_index(folio));
struct zone *oldzone, *newzone;
int dirty;
- int expected_count = expected_page_refs(mapping, page) + extra_count;
- int nr = thp_nr_pages(page);
+ int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
+ long nr = folio_nr_pages(folio);
if (!mapping) {
/* Anonymous page without mapping */
- if (page_count(page) != expected_count)
+ if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
/* No turning back from here */
- newpage->index = page->index;
- newpage->mapping = page->mapping;
- if (PageSwapBacked(page))
- __SetPageSwapBacked(newpage);
+ newfolio->index = folio->index;
+ newfolio->mapping = folio->mapping;
+ if (folio_test_swapbacked(folio))
+ __folio_set_swapbacked(newfolio);
return MIGRATEPAGE_SUCCESS;
}
- oldzone = page_zone(page);
- newzone = page_zone(newpage);
+ oldzone = folio_zone(folio);
+ newzone = folio_zone(newfolio);
xas_lock_irq(&xas);
- if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ if (folio_ref_count(folio) != expected_count ||
+ xas_load(&xas) != folio) {
xas_unlock_irq(&xas);
return -EAGAIN;
}
- if (!page_ref_freeze(page, expected_count)) {
+ if (!folio_ref_freeze(folio, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
}
/*
- * Now we know that no one else is looking at the page:
+ * Now we know that no one else is looking at the folio:
* no turning back from here.
*/
- newpage->index = page->index;
- newpage->mapping = page->mapping;
- page_ref_add(newpage, nr); /* add cache reference */
- if (PageSwapBacked(page)) {
- __SetPageSwapBacked(newpage);
- if (PageSwapCache(page)) {
- SetPageSwapCache(newpage);
- set_page_private(newpage, page_private(page));
+ newfolio->index = folio->index;
+ newfolio->mapping = folio->mapping;
+ folio_ref_add(newfolio, nr); /* add cache reference */
+ if (folio_test_swapbacked(folio)) {
+ __folio_set_swapbacked(newfolio);
+ if (folio_test_swapcache(folio)) {
+ folio_set_swapcache(newfolio);
+ newfolio->private = folio_get_private(folio);
}
} else {
- VM_BUG_ON_PAGE(PageSwapCache(page), page);
+ VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
}
/* Move dirty while page refs frozen and newpage not yet exposed */
- dirty = PageDirty(page);
+ dirty = folio_test_dirty(folio);
if (dirty) {
- ClearPageDirty(page);
- SetPageDirty(newpage);
+ folio_clear_dirty(folio);
+ folio_set_dirty(newfolio);
}
- xas_store(&xas, newpage);
- if (PageTransHuge(page)) {
+ xas_store(&xas, newfolio);
+ if (nr > 1) {
int i;
for (i = 1; i < nr; i++) {
xas_next(&xas);
- xas_store(&xas, newpage);
+ xas_store(&xas, newfolio);
}
}
@@ -453,7 +454,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
* to one less reference.
* We know this isn't the last reference.
*/
- page_ref_unfreeze(page, expected_count - nr);
+ folio_ref_unfreeze(folio, expected_count - nr);
xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -472,18 +473,18 @@ int migrate_page_move_mapping(struct address_space *mapping,
struct lruvec *old_lruvec, *new_lruvec;
struct mem_cgroup *memcg;
- memcg = page_memcg(page);
+ memcg = folio_memcg(folio);
old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
__mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
__mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
- if (PageSwapBacked(page) && !PageSwapCache(page)) {
+ if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
__mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
__mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
}
#ifdef CONFIG_SWAP
- if (PageSwapCache(page)) {
+ if (folio_test_swapcache(folio)) {
__mod_lruvec_state(old_lruvec, NR_SWAPCACHE, -nr);
__mod_lruvec_state(new_lruvec, NR_SWAPCACHE, nr);
}
@@ -499,11 +500,11 @@ int migrate_page_move_mapping(struct address_space *mapping,
return MIGRATEPAGE_SUCCESS;
}
-EXPORT_SYMBOL(migrate_page_move_mapping);
+EXPORT_SYMBOL(folio_migrate_mapping);
/*
* The expected number of remaining references is the same as that
- * of migrate_page_move_mapping().
+ * of folio_migrate_mapping().
*/
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
@@ -538,91 +539,87 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
}
/*
- * Copy the page to its new location
+ * Copy the flags and some other ancillary information
*/
-void migrate_page_states(struct page *newpage, struct page *page)
+void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{
int cpupid;
- if (PageError(page))
- SetPageError(newpage);
- if (PageReferenced(page))
- SetPageReferenced(newpage);
- if (PageUptodate(page))
- SetPageUptodate(newpage);
- if (TestClearPageActive(page)) {
- VM_BUG_ON_PAGE(PageUnevictable(page), page);
- SetPageActive(newpage);
- } else if (TestClearPageUnevictable(page))
- SetPageUnevictable(newpage);
- if (PageWorkingset(page))
- SetPageWorkingset(newpage);
- if (PageChecked(page))
- SetPageChecked(newpage);
- if (PageMappedToDisk(page))
- SetPageMappedToDisk(newpage);
-
- /* Move dirty on pages not done by migrate_page_move_mapping() */
- if (PageDirty(page))
- SetPageDirty(newpage);
-
- if (page_is_young(page))
- set_page_young(newpage);
- if (page_is_idle(page))
- set_page_idle(newpage);
+ if (folio_test_error(folio))
+ folio_set_error(newfolio);
+ if (folio_test_referenced(folio))
+ folio_set_referenced(newfolio);
+ if (folio_test_uptodate(folio))
+ folio_mark_uptodate(newfolio);
+ if (folio_test_clear_active(folio)) {
+ VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
+ folio_set_active(newfolio);
+ } else if (folio_test_clear_unevictable(folio))
+ folio_set_unevictable(newfolio);
+ if (folio_test_workingset(folio))
+ folio_set_workingset(newfolio);
+ if (folio_test_checked(folio))
+ folio_set_checked(newfolio);
+ if (folio_test_mappedtodisk(folio))
+ folio_set_mappedtodisk(newfolio);
+
+ /* Move dirty on pages not done by folio_migrate_mapping() */
+ if (folio_test_dirty(folio))
+ folio_set_dirty(newfolio);
+
+ if (folio_test_young(folio))
+ folio_set_young(newfolio);
+ if (folio_test_idle(folio))
+ folio_set_idle(newfolio);
/*
* Copy NUMA information to the new page, to prevent over-eager
* future migrations of this same page.
*/
- cpupid = page_cpupid_xchg_last(page, -1);
- page_cpupid_xchg_last(newpage, cpupid);
+ cpupid = page_cpupid_xchg_last(&folio->page, -1);
+ page_cpupid_xchg_last(&newfolio->page, cpupid);
- ksm_migrate_page(newpage, page);
+ folio_migrate_ksm(newfolio, folio);
/*
* Please do not reorder this without considering how mm/ksm.c's
* get_ksm_page() depends upon ksm_migrate_page() and PageSwapCache().
*/
- if (PageSwapCache(page))
- ClearPageSwapCache(page);
- ClearPagePrivate(page);
+ if (folio_test_swapcache(folio))
+ folio_clear_swapcache(folio);
+ folio_clear_private(folio);
/* page->private contains hugetlb specific flags */
- if (!PageHuge(page))
- set_page_private(page, 0);
+ if (!folio_test_hugetlb(folio))
+ folio->private = NULL;
/*
* If any waiters have accumulated on the new page then
* wake them up.
*/
- if (PageWriteback(newpage))
- end_page_writeback(newpage);
+ if (folio_test_writeback(newfolio))
+ folio_end_writeback(newfolio);
/*
* PG_readahead shares the same bit with PG_reclaim. The above
* end_page_writeback() may clear PG_readahead mistakenly, so set the
* bit after that.
*/
- if (PageReadahead(page))
- SetPageReadahead(newpage);
+ if (folio_test_readahead(folio))
+ folio_set_readahead(newfolio);
- copy_page_owner(page, newpage);
+ folio_copy_owner(newfolio, folio);
- if (!PageHuge(page))
- mem_cgroup_migrate(page, newpage);
+ if (!folio_test_hugetlb(folio))
+ mem_cgroup_migrate(folio, newfolio);
}
-EXPORT_SYMBOL(migrate_page_states);
+EXPORT_SYMBOL(folio_migrate_flags);
-void migrate_page_copy(struct page *newpage, struct page *page)
+void folio_migrate_copy(struct folio *newfolio, struct folio *folio)
{
- if (PageHuge(page) || PageTransHuge(page))
- copy_huge_page(newpage, page);
- else
- copy_highpage(newpage, page);
-
- migrate_page_states(newpage, page);
+ folio_copy(newfolio, folio);
+ folio_migrate_flags(newfolio, folio);
}
-EXPORT_SYMBOL(migrate_page_copy);
+EXPORT_SYMBOL(folio_migrate_copy);
/************************************************************
* Migration functions
@@ -638,19 +635,21 @@ int migrate_page(struct address_space *mapping,
struct page *newpage, struct page *page,
enum migrate_mode mode)
{
+ struct folio *newfolio = page_folio(newpage);
+ struct folio *folio = page_folio(page);
int rc;
- BUG_ON(PageWriteback(page)); /* Writeback must be complete */
+ BUG_ON(folio_test_writeback(folio)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+ rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
+ folio_migrate_copy(newfolio, folio);
else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(newfolio, folio);
return MIGRATEPAGE_SUCCESS;
}
EXPORT_SYMBOL(migrate_page);
@@ -2468,7 +2467,7 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
* @page: struct page to check
*
* Pinned pages cannot be migrated. This is the same test as in
- * migrate_page_move_mapping(), except that here we allow migration of a
+ * folio_migrate_mapping(), except that here we allow migration of a
* ZONE_DEVICE page.
*/
static bool migrate_vma_check_page(struct page *page)
@@ -2846,7 +2845,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (unlikely(anon_vma_prepare(vma)))
goto abort;
- if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
goto abort;
/*
@@ -3066,7 +3065,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
EXPORT_SYMBOL(migrate_vma_finalize);
#endif /* CONFIG_DEVICE_PRIVATE */
-#if defined(CONFIG_MEMORY_HOTPLUG)
+#if defined(CONFIG_HOTPLUG_CPU)
/* Disable reclaim-based migration. */
static void __disable_all_migrate_targets(void)
{
@@ -3209,25 +3208,6 @@ static void set_migration_target_nodes(void)
}
/*
- * React to hotplug events that might affect the migration targets
- * like events that online or offline NUMA nodes.
- *
- * The ordering is also currently dependent on which nodes have
- * CPUs. That means we need CPU on/offline notification too.
- */
-static int migration_online_cpu(unsigned int cpu)
-{
- set_migration_target_nodes();
- return 0;
-}
-
-static int migration_offline_cpu(unsigned int cpu)
-{
- set_migration_target_nodes();
- return 0;
-}
-
-/*
* This leaves migrate-on-reclaim transiently disabled between
* the MEM_GOING_OFFLINE and MEM_OFFLINE events. This runs
* whether reclaim-based migration is enabled or not, which
@@ -3239,8 +3219,18 @@ static int migration_offline_cpu(unsigned int cpu)
* set_migration_target_nodes().
*/
static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
- unsigned long action, void *arg)
+ unsigned long action, void *_arg)
{
+ struct memory_notify *arg = _arg;
+
+ /*
+ * Only update the node migration order when a node is
+ * changing status, like online->offline. This avoids
+ * the overhead of synchronize_rcu() in most cases.
+ */
+ if (arg->status_change_nid < 0)
+ return notifier_from_errno(0);
+
switch (action) {
case MEM_GOING_OFFLINE:
/*
@@ -3274,13 +3264,31 @@ static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
return notifier_from_errno(0);
}
+/*
+ * React to hotplug events that might affect the migration targets
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs. That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+ set_migration_target_nodes();
+ return 0;
+}
+
static int __init migrate_on_reclaim_init(void)
{
int ret;
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
- migration_online_cpu,
- migration_offline_cpu);
+ ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
+ NULL, migration_offline_cpu);
/*
* In the unlikely case that this fails, the automatic
* migration targets may become suboptimal for nodes
@@ -3288,9 +3296,73 @@ static int __init migrate_on_reclaim_init(void)
* rare case, do not bother trying to do anything special.
*/
WARN_ON(ret < 0);
+ ret = cpuhp_setup_state(CPUHP_AP_MM_DEMOTION_ONLINE, "mm/demotion:online",
+ migration_online_cpu, NULL);
+ WARN_ON(ret < 0);
hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
return 0;
}
late_initcall(migrate_on_reclaim_init);
-#endif /* CONFIG_MEMORY_HOTPLUG */
+#endif /* CONFIG_HOTPLUG_CPU */
+
+bool numa_demotion_enabled = false;
+
+#ifdef CONFIG_SYSFS
+static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%s\n",
+ numa_demotion_enabled ? "true" : "false");
+}
+
+static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
+ numa_demotion_enabled = true;
+ else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
+ numa_demotion_enabled = false;
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static struct kobj_attribute numa_demotion_enabled_attr =
+ __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
+ numa_demotion_enabled_store);
+
+static struct attribute *numa_attrs[] = {
+ &numa_demotion_enabled_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group numa_attr_group = {
+ .attrs = numa_attrs,
+};
+
+static int __init numa_init_sysfs(void)
+{
+ int err;
+ struct kobject *numa_kobj;
+
+ numa_kobj = kobject_create_and_add("numa", mm_kobj);
+ if (!numa_kobj) {
+ pr_err("failed to create numa kobject\n");
+ return -ENOMEM;
+ }
+ err = sysfs_create_group(numa_kobj, &numa_attr_group);
+ if (err) {
+ pr_err("failed to register numa group\n");
+ goto delete_obj;
+ }
+ return 0;
+
+delete_obj:
+ kobject_put(numa_kobj);
+ return err;
+}
+subsys_initcall(numa_init_sysfs);
+#endif
diff --git a/mm/mlock.c b/mm/mlock.c
index 16d2ee160d43..e263d62ae2d0 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -271,6 +271,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
/* Phase 1: page isolation */
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
+ struct folio *folio = page_folio(page);
if (TestClearPageMlocked(page)) {
/*
@@ -278,7 +279,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
* so we can spare the get_page() here.
*/
if (TestClearPageLRU(page)) {
- lruvec = relock_page_lruvec_irq(page, lruvec);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
del_page_from_lru_list(page, lruvec);
continue;
} else
diff --git a/mm/mmap.c b/mm/mmap.c
index 88dcc5c25225..bfb0ea164a90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1599,7 +1599,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
- struct ucounts *ucounts = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1615,7 +1614,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
- &ucounts, HUGETLB_ANONHUGE_INODE,
+ HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
@@ -3332,7 +3331,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
- mm->total_vm += npages;
+ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
if (is_exec_mapping(flags))
mm->exec_vm += npages;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 883e2cc85cad..e552f5e0ccbd 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -563,7 +563,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
error = -ENOMEM;
if (!vma)
goto out;
- prev = vma->vm_prev;
+
if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
goto out;
@@ -581,8 +581,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
goto out;
}
}
+
if (start > vma->vm_start)
prev = vma;
+ else
+ prev = vma->vm_prev;
for (nstart = start ; ; ) {
unsigned long mask_off_old_flags;
diff --git a/mm/mremap.c b/mm/mremap.c
index badfe17ade1f..002eec83e91e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -489,6 +489,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
+ if (is_vm_hugetlb_page(vma))
+ return move_hugetlb_page_tables(vma, new_vma, old_addr,
+ new_addr, len);
+
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
@@ -565,6 +569,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
bool *locked, unsigned long flags,
struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
+ long to_account = new_len - old_len;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
@@ -583,6 +588,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
+ if (unlikely(flags & MREMAP_DONTUNMAP))
+ to_account = new_len;
+
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
err = vma->vm_ops->may_split(vma, old_addr);
@@ -604,8 +612,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (err)
return err;
- if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) {
- if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT))
+ if (vm_flags & VM_ACCOUNT) {
+ if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
return -ENOMEM;
}
@@ -613,8 +621,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
if (!new_vma) {
- if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT))
- vm_unacct_memory(new_len >> PAGE_SHIFT);
+ if (vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(to_account >> PAGE_SHIFT);
return -ENOMEM;
}
@@ -642,6 +650,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
mremap_userfaultfd_prep(new_vma, uf);
}
+ if (is_vm_hugetlb_page(vma)) {
+ clear_vma_resv_huge_pages(vma);
+ }
+
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
vma->vm_flags &= ~VM_ACCOUNT;
@@ -708,8 +720,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
static struct vm_area_struct *vma_to_resize(unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long flags,
- unsigned long *p)
+ unsigned long old_len, unsigned long new_len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@@ -736,9 +747,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return ERR_PTR(-EINVAL);
- if (is_vm_hugetlb_page(vma))
- return ERR_PTR(-EINVAL);
-
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
return ERR_PTR(-EFAULT);
@@ -768,13 +776,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
(new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
- if (vma->vm_flags & VM_ACCOUNT) {
- unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return ERR_PTR(-ENOMEM);
- *p = charged;
- }
-
return vma;
}
@@ -787,7 +788,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
- unsigned long charged = 0;
unsigned long map_flags = 0;
if (offset_in_page(new_addr))
@@ -830,7 +830,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
old_len = new_len;
}
- vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -853,7 +853,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
if (IS_ERR_VALUE(ret))
- goto out1;
+ goto out;
/* We got a new mapping */
if (!(flags & MREMAP_FIXED))
@@ -862,12 +862,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
- if (!(offset_in_page(ret)))
- goto out;
-
-out1:
- vm_unacct_memory(charged);
-
out:
return ret;
}
@@ -899,7 +893,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
- unsigned long charged = 0;
bool locked = false;
bool downgraded = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
@@ -949,6 +942,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (mmap_write_lock_killable(current->mm))
return -EINTR;
+ vma = find_vma(mm, addr);
+ if (!vma || vma->vm_start > addr) {
+ ret = EFAULT;
+ goto out;
+ }
+
+ if (is_vm_hugetlb_page(vma)) {
+ struct hstate *h __maybe_unused = hstate_vma(vma);
+
+ old_len = ALIGN(old_len, huge_page_size(h));
+ new_len = ALIGN(new_len, huge_page_size(h));
+
+ /* addrs must be huge page aligned */
+ if (addr & ~huge_page_mask(h))
+ goto out;
+ if (new_addr & ~huge_page_mask(h))
+ goto out;
+
+ /*
+ * Don't allow remap expansion, because the underlying hugetlb
+ * reservation is not yet capable to handle split reservation.
+ */
+ if (new_len > old_len)
+ goto out;
+ }
if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
ret = mremap_to(addr, old_len, new_addr, new_len,
@@ -981,7 +999,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -992,10 +1010,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (old_len == vma->vm_end - addr) {
/* can we just expand the current mapping? */
if (vma_expandable(vma, new_len - old_len)) {
- int pages = (new_len - old_len) >> PAGE_SHIFT;
+ long pages = (new_len - old_len) >> PAGE_SHIFT;
+
+ if (vma->vm_flags & VM_ACCOUNT) {
+ if (security_vm_enough_memory_mm(mm, pages)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
if (vma_adjust(vma, vma->vm_start, addr + new_len,
vma->vm_pgoff, NULL)) {
+ vm_unacct_memory(pages);
ret = -ENOMEM;
goto out;
}
@@ -1034,10 +1060,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
&locked, flags, &uf, &uf_unmap);
}
out:
- if (offset_in_page(ret)) {
- vm_unacct_memory(charged);
+ if (offset_in_page(ret))
locked = false;
- }
if (downgraded)
mmap_read_unlock(current->mm);
else
diff --git a/mm/nommu.c b/mm/nommu.c
index 02d2427b8f9e..55a9e48a7a02 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -27,7 +27,6 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
-#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/compiler.h>
#include <linux/mount.h>
@@ -1639,12 +1638,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
- unsigned long len, unsigned long pgoff, unsigned long flags)
-{
- return -ENOMEM;
-}
-
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 70d399d5817e..1ddabefcfb5a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -641,6 +641,8 @@ done:
static int oom_reaper(void *unused)
{
+ set_freezable();
+
while (true) {
struct task_struct *tsk = NULL;
@@ -787,9 +789,9 @@ static inline bool __task_will_free_mem(struct task_struct *task)
struct signal_struct *sig = task->signal;
/*
- * A coredumping process may sleep for an extended period in exit_mm(),
- * so the oom killer cannot assume that the process will promptly exit
- * and release memory.
+ * A coredumping process may sleep for an extended period in
+ * coredump_task_exit(), so the oom killer cannot assume that
+ * the process will promptly exit and release memory.
*/
if (sig->flags & SIGNAL_GROUP_COREDUMP)
return false;
@@ -1120,27 +1122,24 @@ bool out_of_memory(struct oom_control *oc)
}
/*
- * The pagefault handler calls here because it is out of memory, so kill a
- * memory-hogging task. If oom_lock is held by somebody else, a parallel oom
- * killing is already in progress so do nothing.
+ * The pagefault handler calls here because some allocation has failed. We have
+ * to take care of the memcg OOM here because this is the only safe context without
+ * any locks held but let the oom killer triggered from the allocation context care
+ * about the global OOM.
*/
void pagefault_out_of_memory(void)
{
- struct oom_control oc = {
- .zonelist = NULL,
- .nodemask = NULL,
- .memcg = NULL,
- .gfp_mask = 0,
- .order = 0,
- };
+ static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
if (mem_cgroup_oom_synchronize(true))
return;
- if (!mutex_trylock(&oom_lock))
+ if (fatal_signal_pending(current))
return;
- out_of_memory(&oc);
- mutex_unlock(&oom_lock);
+
+ if (__ratelimit(&pfoom_rs))
+ pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
}
SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
@@ -1150,7 +1149,7 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
struct task_struct *task;
struct task_struct *p;
unsigned int f_flags;
- bool reap = true;
+ bool reap = false;
long ret = 0;
if (flags)
@@ -1170,15 +1169,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
goto put_task;
}
- mm = p->mm;
- mmgrab(mm);
-
- /* If the work has been done already, just exit with success */
- if (test_bit(MMF_OOM_SKIP, &mm->flags))
- reap = false;
- else if (!task_will_free_mem(p)) {
- reap = false;
- ret = -EINVAL;
+ if (mmget_not_zero(p->mm)) {
+ mm = p->mm;
+ if (task_will_free_mem(p))
+ reap = true;
+ else {
+ /* Error only if the work has not been done already */
+ if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+ ret = -EINVAL;
+ }
}
task_unlock(p);
@@ -1194,7 +1193,8 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
mmap_read_unlock(mm);
drop_mm:
- mmdrop(mm);
+ if (mm)
+ mmput(mm);
put_task:
put_task_struct(task);
return ret;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 4812a17b288c..2d498bb62248 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -562,12 +562,12 @@ static unsigned long wp_next_time(unsigned long cur_time)
return cur_time;
}
-static void wb_domain_writeout_inc(struct wb_domain *dom,
+static void wb_domain_writeout_add(struct wb_domain *dom,
struct fprop_local_percpu *completions,
- unsigned int max_prop_frac)
+ unsigned int max_prop_frac, long nr)
{
- __fprop_inc_percpu_max(&dom->completions, completions,
- max_prop_frac);
+ __fprop_add_percpu_max(&dom->completions, completions,
+ max_prop_frac, nr);
/* First event after period switching was turned off? */
if (unlikely(!dom->period_time)) {
/*
@@ -583,20 +583,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom,
/*
* Increment @wb's writeout completion count and the global writeout
- * completion count. Called from test_clear_page_writeback().
+ * completion count. Called from __folio_end_writeback().
*/
-static inline void __wb_writeout_inc(struct bdi_writeback *wb)
+static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
{
struct wb_domain *cgdom;
- inc_wb_stat(wb, WB_WRITTEN);
- wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
- wb->bdi->max_prop_frac);
+ wb_stat_mod(wb, WB_WRITTEN, nr);
+ wb_domain_writeout_add(&global_wb_domain, &wb->completions,
+ wb->bdi->max_prop_frac, nr);
cgdom = mem_cgroup_wb_domain(wb);
if (cgdom)
- wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
- wb->bdi->max_prop_frac);
+ wb_domain_writeout_add(cgdom, wb_memcg_completions(wb),
+ wb->bdi->max_prop_frac, nr);
}
void wb_writeout_inc(struct bdi_writeback *wb)
@@ -604,7 +604,7 @@ void wb_writeout_inc(struct bdi_writeback *wb)
unsigned long flags;
local_irq_save(flags);
- __wb_writeout_inc(wb);
+ __wb_writeout_add(wb, 1);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
@@ -1084,7 +1084,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
* write_bandwidth = ---------------------------------------------------
* period
*
- * @written may have decreased due to account_page_redirty().
+ * @written may have decreased due to folio_account_redirty().
* Avoid underflowing @bw calculation.
*/
bw = written - min(written, wb->written_stamp);
@@ -2366,8 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
ret = generic_writepages(mapping, wbc);
if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
break;
- cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+
+ /*
+ * Lacking an allocation context or the locality or writeback
+ * state of any of the inode's pages, throttle based on
+ * writeback activity on the local node. It's as good a
+ * guess as any.
+ */
+ reclaim_throttle(NODE_DATA(numa_node_id()),
+ VMSCAN_THROTTLE_WRITEBACK);
}
/*
* Usually few pages are written by now from those we've just submitted
@@ -2381,44 +2388,44 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
}
/**
- * write_one_page - write out a single page and wait on I/O
- * @page: the page to write
+ * folio_write_one - write out a single folio and wait on I/O.
+ * @folio: The folio to write.
*
- * The page must be locked by the caller and will be unlocked upon return.
+ * The folio must be locked by the caller and will be unlocked upon return.
*
* Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
* function returns.
*
* Return: %0 on success, negative error code otherwise
*/
-int write_one_page(struct page *page)
+int folio_write_one(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
int ret = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
- .nr_to_write = 1,
+ .nr_to_write = folio_nr_pages(folio),
};
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
- if (clear_page_dirty_for_io(page)) {
- get_page(page);
- ret = mapping->a_ops->writepage(page, &wbc);
+ if (folio_clear_dirty_for_io(folio)) {
+ folio_get(folio);
+ ret = mapping->a_ops->writepage(&folio->page, &wbc);
if (ret == 0)
- wait_on_page_writeback(page);
- put_page(page);
+ folio_wait_writeback(folio);
+ folio_put(folio);
} else {
- unlock_page(page);
+ folio_unlock(folio);
}
if (!ret)
ret = filemap_check_errors(mapping);
return ret;
}
-EXPORT_SYMBOL(write_one_page);
+EXPORT_SYMBOL(folio_write_one);
/*
* For address_spaces which do not use buffers nor write back.
@@ -2438,29 +2445,30 @@ EXPORT_SYMBOL(__set_page_dirty_no_writeback);
*
* NOTE: This relies on being atomic wrt interrupts.
*/
-static void account_page_dirtied(struct page *page,
+static void folio_account_dirtied(struct folio *folio,
struct address_space *mapping)
{
struct inode *inode = mapping->host;
- trace_writeback_dirty_page(page, mapping);
+ trace_writeback_dirty_folio(folio, mapping);
if (mapping_can_writeback(mapping)) {
struct bdi_writeback *wb;
+ long nr = folio_nr_pages(folio);
- inode_attach_wb(inode, page);
+ inode_attach_wb(inode, &folio->page);
wb = inode_to_wb(inode);
- __inc_lruvec_page_state(page, NR_FILE_DIRTY);
- __inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
- __inc_node_page_state(page, NR_DIRTIED);
- inc_wb_stat(wb, WB_RECLAIMABLE);
- inc_wb_stat(wb, WB_DIRTIED);
- task_io_account_write(PAGE_SIZE);
- current->nr_dirtied++;
- __this_cpu_inc(bdp_ratelimits);
+ __lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+ __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
+ __node_stat_mod_folio(folio, NR_DIRTIED, nr);
+ wb_stat_mod(wb, WB_RECLAIMABLE, nr);
+ wb_stat_mod(wb, WB_DIRTIED, nr);
+ task_io_account_write(nr * PAGE_SIZE);
+ current->nr_dirtied += nr;
+ __this_cpu_add(bdp_ratelimits, nr);
- mem_cgroup_track_foreign_dirty(page, wb);
+ mem_cgroup_track_foreign_dirty(folio, wb);
}
}
@@ -2469,130 +2477,152 @@ static void account_page_dirtied(struct page *page,
*
* Caller must hold lock_page_memcg().
*/
-void account_page_cleaned(struct page *page, struct address_space *mapping,
+void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
struct bdi_writeback *wb)
{
if (mapping_can_writeback(mapping)) {
- dec_lruvec_page_state(page, NR_FILE_DIRTY);
- dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
- dec_wb_stat(wb, WB_RECLAIMABLE);
- task_io_account_cancelled_write(PAGE_SIZE);
+ long nr = folio_nr_pages(folio);
+ lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+ task_io_account_cancelled_write(nr * PAGE_SIZE);
}
}
/*
- * Mark the page dirty, and set it dirty in the page cache, and mark the inode
- * dirty.
+ * Mark the folio dirty, and set it dirty in the page cache, and mark
+ * the inode dirty.
*
- * If warn is true, then emit a warning if the page is not uptodate and has
+ * If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
* The caller must hold lock_page_memcg().
*/
-void __set_page_dirty(struct page *page, struct address_space *mapping,
+void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
int warn)
{
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- if (page->mapping) { /* Race with truncate? */
- WARN_ON_ONCE(warn && !PageUptodate(page));
- account_page_dirtied(page, mapping);
- __xa_set_mark(&mapping->i_pages, page_index(page),
+ if (folio->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
+ folio_account_dirtied(folio, mapping);
+ __xa_set_mark(&mapping->i_pages, folio_index(folio),
PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
-/*
- * For address_spaces which do not use buffers. Just tag the page as dirty in
- * the xarray.
+/**
+ * filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
+ * @mapping: Address space this folio belongs to.
+ * @folio: Folio to be marked as dirty.
*
- * This is also used when a single buffer is being dirtied: we want to set the
- * page dirty in that case, but not all the buffers. This is a "bottom-up"
- * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ * Filesystems which do not use buffer heads should call this function
+ * from their set_page_dirty address space operation. It ignores the
+ * contents of folio_get_private(), so if the filesystem marks individual
+ * blocks as dirty, the filesystem should handle that itself.
*
- * The caller must ensure this doesn't race with truncation. Most will simply
- * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
- * the pte lock held, which also locks out truncation.
+ * This is also sometimes used by filesystems which use buffer_heads when
+ * a single buffer is being dirtied: we want to set the folio dirty in
+ * that case, but not all the buffers. This is a "bottom-up" dirtying,
+ * whereas __set_page_dirty_buffers() is a "top-down" dirtying.
+ *
+ * The caller must ensure this doesn't race with truncation. Most will
+ * simply hold the folio lock, but e.g. zap_pte_range() calls with the
+ * folio mapped and the pte lock held, which also locks out truncation.
*/
-int __set_page_dirty_nobuffers(struct page *page)
+bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
- lock_page_memcg(page);
- if (!TestSetPageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ folio_memcg_lock(folio);
+ if (folio_test_set_dirty(folio)) {
+ folio_memcg_unlock(folio);
+ return false;
+ }
- if (!mapping) {
- unlock_page_memcg(page);
- return 1;
- }
- __set_page_dirty(page, mapping, !PagePrivate(page));
- unlock_page_memcg(page);
+ __folio_mark_dirty(folio, mapping, !folio_test_private(folio));
+ folio_memcg_unlock(folio);
- if (mapping->host) {
- /* !PageAnon && !swapper_space */
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- }
- return 1;
+ if (mapping->host) {
+ /* !PageAnon && !swapper_space */
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
- unlock_page_memcg(page);
- return 0;
+ return true;
}
-EXPORT_SYMBOL(__set_page_dirty_nobuffers);
+EXPORT_SYMBOL(filemap_dirty_folio);
-/*
- * Call this whenever redirtying a page, to de-account the dirty counters
- * (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
- * counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
- * systematic errors in balanced_dirty_ratelimit and the dirty pages position
- * control.
+/**
+ * folio_account_redirty - Manually account for redirtying a page.
+ * @folio: The folio which is being redirtied.
+ *
+ * Most filesystems should call folio_redirty_for_writepage() instead
+ * of this fuction. If your filesystem is doing writeback outside the
+ * context of a writeback_control(), it can call this when redirtying
+ * a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
+ * tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
+ * WB_WRITTEN) in long term. The mismatches will lead to systematic errors
+ * in balanced_dirty_ratelimit and the dirty pages position control.
*/
-void account_page_redirty(struct page *page)
+void folio_account_redirty(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
+ long nr = folio_nr_pages(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
- current->nr_dirtied--;
- dec_node_page_state(page, NR_DIRTIED);
- dec_wb_stat(wb, WB_DIRTIED);
+ current->nr_dirtied -= nr;
+ node_stat_mod_folio(folio, NR_DIRTIED, -nr);
+ wb_stat_mod(wb, WB_DIRTIED, -nr);
unlocked_inode_to_wb_end(inode, &cookie);
}
}
-EXPORT_SYMBOL(account_page_redirty);
+EXPORT_SYMBOL(folio_account_redirty);
-/*
- * When a writepage implementation decides that it doesn't want to write this
- * page for some reason, it should redirty the locked page via
- * redirty_page_for_writepage() and it should then unlock the page and return 0
+/**
+ * folio_redirty_for_writepage - Decline to write a dirty folio.
+ * @wbc: The writeback control.
+ * @folio: The folio.
+ *
+ * When a writepage implementation decides that it doesn't want to write
+ * @folio for some reason, it should call this function, unlock @folio and
+ * return 0.
+ *
+ * Return: True if we redirtied the folio. False if someone else dirtied
+ * it first.
*/
-int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
+bool folio_redirty_for_writepage(struct writeback_control *wbc,
+ struct folio *folio)
{
- int ret;
+ bool ret;
+ long nr = folio_nr_pages(folio);
+
+ wbc->pages_skipped += nr;
+ ret = filemap_dirty_folio(folio->mapping, folio);
+ folio_account_redirty(folio);
- wbc->pages_skipped++;
- ret = __set_page_dirty_nobuffers(page);
- account_page_redirty(page);
return ret;
}
-EXPORT_SYMBOL(redirty_page_for_writepage);
+EXPORT_SYMBOL(folio_redirty_for_writepage);
-/*
- * Dirty a page.
+/**
+ * folio_mark_dirty - Mark a folio as being modified.
+ * @folio: The folio.
+ *
+ * For folios with a mapping this should be done under the page lock
+ * for the benefit of asynchronous memory errors who prefer a consistent
+ * dirty state. This rule can be broken in some special cases,
+ * but should be better not to.
*
- * For pages with a mapping this should be done under the page lock for the
- * benefit of asynchronous memory errors who prefer a consistent dirty state.
- * This rule can be broken in some special cases, but should be better not to.
+ * Return: True if the folio was newly dirtied, false if it was already dirty.
*/
-int set_page_dirty(struct page *page)
+bool folio_mark_dirty(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping = folio_mapping(folio);
- page = compound_head(page);
if (likely(mapping)) {
/*
* readahead/lru_deactivate_page could remain
@@ -2604,17 +2634,17 @@ int set_page_dirty(struct page *page)
* it will confuse readahead and make it restart the size rampup
* process. But it's a trivial problem.
*/
- if (PageReclaim(page))
- ClearPageReclaim(page);
- return mapping->a_ops->set_page_dirty(page);
+ if (folio_test_reclaim(folio))
+ folio_clear_reclaim(folio);
+ return mapping->a_ops->set_page_dirty(&folio->page);
}
- if (!PageDirty(page)) {
- if (!TestSetPageDirty(page))
- return 1;
+ if (!folio_test_dirty(folio)) {
+ if (!folio_test_set_dirty(folio))
+ return true;
}
- return 0;
+ return false;
}
-EXPORT_SYMBOL(set_page_dirty);
+EXPORT_SYMBOL(folio_mark_dirty);
/*
* set_page_dirty() is racy if the caller has no reference against
@@ -2650,49 +2680,49 @@ EXPORT_SYMBOL(set_page_dirty_lock);
* page without actually doing it through the VM. Can you say "ext3 is
* horribly ugly"? Thought you could.
*/
-void __cancel_dirty_page(struct page *page)
+void __folio_cancel_dirty(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
+ struct address_space *mapping = folio_mapping(folio);
if (mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
- lock_page_memcg(page);
+ folio_memcg_lock(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
- if (TestClearPageDirty(page))
- account_page_cleaned(page, mapping, wb);
+ if (folio_test_clear_dirty(folio))
+ folio_account_cleaned(folio, mapping, wb);
unlocked_inode_to_wb_end(inode, &cookie);
- unlock_page_memcg(page);
+ folio_memcg_unlock(folio);
} else {
- ClearPageDirty(page);
+ folio_clear_dirty(folio);
}
}
-EXPORT_SYMBOL(__cancel_dirty_page);
+EXPORT_SYMBOL(__folio_cancel_dirty);
/*
- * Clear a page's dirty flag, while caring for dirty memory accounting.
- * Returns true if the page was previously dirty.
- *
- * This is for preparing to put the page under writeout. We leave the page
- * tagged as dirty in the xarray so that a concurrent write-for-sync
- * can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
- * implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and xarray dirty tag
- * back into sync.
- *
- * This incoherency between the page's dirty flag and xarray tag is
- * unfortunate, but it only exists while the page is locked.
+ * Clear a folio's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the folio was previously dirty.
+ *
+ * This is for preparing to put the folio under writeout. We leave
+ * the folio tagged as dirty in the xarray so that a concurrent
+ * write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
+ * The ->writepage implementation will run either folio_start_writeback()
+ * or folio_mark_dirty(), at which stage we bring the folio's dirty flag
+ * and xarray dirty tag back into sync.
+ *
+ * This incoherency between the folio's dirty flag and xarray tag is
+ * unfortunate, but it only exists while the folio is locked.
*/
-int clear_page_dirty_for_io(struct page *page)
+bool folio_clear_dirty_for_io(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
- int ret = 0;
+ struct address_space *mapping = folio_mapping(folio);
+ bool ret = false;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
@@ -2705,48 +2735,49 @@ int clear_page_dirty_for_io(struct page *page)
* We use this sequence to make sure that
* (a) we account for dirty stats properly
* (b) we tell the low-level filesystem to
- * mark the whole page dirty if it was
+ * mark the whole folio dirty if it was
* dirty in a pagetable. Only to then
- * (c) clean the page again and return 1 to
+ * (c) clean the folio again and return 1 to
* cause the writeback.
*
* This way we avoid all nasty races with the
* dirty bit in multiple places and clearing
* them concurrently from different threads.
*
- * Note! Normally the "set_page_dirty(page)"
+ * Note! Normally the "folio_mark_dirty(folio)"
* has no effect on the actual dirty bit - since
* that will already usually be set. But we
* need the side effects, and it can help us
* avoid races.
*
- * We basically use the page "master dirty bit"
+ * We basically use the folio "master dirty bit"
* as a serialization point for all the different
* threads doing their things.
*/
- if (page_mkclean(page))
- set_page_dirty(page);
+ if (folio_mkclean(folio))
+ folio_mark_dirty(folio);
/*
* We carefully synchronise fault handlers against
- * installing a dirty pte and marking the page dirty
+ * installing a dirty pte and marking the folio dirty
* at this point. We do this by having them hold the
- * page lock while dirtying the page, and pages are
+ * page lock while dirtying the folio, and folios are
* always locked coming in here, so we get the desired
* exclusion.
*/
wb = unlocked_inode_to_wb_begin(inode, &cookie);
- if (TestClearPageDirty(page)) {
- dec_lruvec_page_state(page, NR_FILE_DIRTY);
- dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
- dec_wb_stat(wb, WB_RECLAIMABLE);
- ret = 1;
+ if (folio_test_clear_dirty(folio)) {
+ long nr = folio_nr_pages(folio);
+ lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
+ ret = true;
}
unlocked_inode_to_wb_end(inode, &cookie);
return ret;
}
- return TestClearPageDirty(page);
+ return folio_test_clear_dirty(folio);
}
-EXPORT_SYMBOL(clear_page_dirty_for_io);
+EXPORT_SYMBOL(folio_clear_dirty_for_io);
static void wb_inode_writeback_start(struct bdi_writeback *wb)
{
@@ -2766,27 +2797,28 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb)
queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
}
-int test_clear_page_writeback(struct page *page)
+bool __folio_end_writeback(struct folio *folio)
{
- struct address_space *mapping = page_mapping(page);
- int ret;
+ long nr = folio_nr_pages(folio);
+ struct address_space *mapping = folio_mapping(folio);
+ bool ret;
- lock_page_memcg(page);
+ folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
- ret = TestClearPageWriteback(page);
+ ret = folio_test_clear_writeback(folio);
if (ret) {
- __xa_clear_mark(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, folio_index(folio),
PAGECACHE_TAG_WRITEBACK);
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
- dec_wb_stat(wb, WB_WRITEBACK);
- __wb_writeout_inc(wb);
+ wb_stat_mod(wb, WB_WRITEBACK, -nr);
+ __wb_writeout_add(wb, nr);
if (!mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK))
wb_inode_writeback_end(wb);
@@ -2799,32 +2831,34 @@ int test_clear_page_writeback(struct page *page)
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
- ret = TestClearPageWriteback(page);
+ ret = folio_test_clear_writeback(folio);
}
if (ret) {
- dec_lruvec_page_state(page, NR_WRITEBACK);
- dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
- inc_node_page_state(page, NR_WRITTEN);
+ lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
+ node_stat_mod_folio(folio, NR_WRITTEN, nr);
}
- unlock_page_memcg(page);
+ folio_memcg_unlock(folio);
return ret;
}
-int __test_set_page_writeback(struct page *page, bool keep_write)
+bool __folio_start_writeback(struct folio *folio, bool keep_write)
{
- struct address_space *mapping = page_mapping(page);
- int ret, access_ret;
+ long nr = folio_nr_pages(folio);
+ struct address_space *mapping = folio_mapping(folio);
+ bool ret;
+ int access_ret;
- lock_page_memcg(page);
+ folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
- XA_STATE(xas, &mapping->i_pages, page_index(page));
+ XA_STATE(xas, &mapping->i_pages, folio_index(folio));
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xas_lock_irqsave(&xas, flags);
xas_load(&xas);
- ret = TestSetPageWriteback(page);
+ ret = folio_test_set_writeback(folio);
if (!ret) {
bool on_wblist;
@@ -2835,84 +2869,105 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
- inc_wb_stat(wb, WB_WRITEBACK);
+ wb_stat_mod(wb, WB_WRITEBACK, nr);
if (!on_wblist)
wb_inode_writeback_start(wb);
}
/*
- * We can come through here when swapping anonymous
- * pages, so we don't necessarily have an inode to track
- * for sync.
+ * We can come through here when swapping
+ * anonymous folios, so we don't necessarily
+ * have an inode to track for sync.
*/
if (mapping->host && !on_wblist)
sb_mark_inode_writeback(mapping->host);
}
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
} else {
- ret = TestSetPageWriteback(page);
+ ret = folio_test_set_writeback(folio);
}
if (!ret) {
- inc_lruvec_page_state(page, NR_WRITEBACK);
- inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
+ lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
+ zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
}
- unlock_page_memcg(page);
- access_ret = arch_make_page_accessible(page);
+ folio_memcg_unlock(folio);
+ access_ret = arch_make_folio_accessible(folio);
/*
* If writeback has been triggered on a page that cannot be made
* accessible, it is too late to recover here.
*/
- VM_BUG_ON_PAGE(access_ret != 0, page);
+ VM_BUG_ON_FOLIO(access_ret != 0, folio);
return ret;
-
}
-EXPORT_SYMBOL(__test_set_page_writeback);
+EXPORT_SYMBOL(__folio_start_writeback);
-/*
- * Wait for a page to complete writeback
+/**
+ * folio_wait_writeback - Wait for a folio to finish writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete.
+ *
+ * Context: Sleeps. Must be called in process context and with
+ * no spinlocks held. Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
*/
-void wait_on_page_writeback(struct page *page)
+void folio_wait_writeback(struct folio *folio)
{
- while (PageWriteback(page)) {
- trace_wait_on_page_writeback(page, page_mapping(page));
- wait_on_page_bit(page, PG_writeback);
+ while (folio_test_writeback(folio)) {
+ trace_folio_wait_writeback(folio, folio_mapping(folio));
+ folio_wait_bit(folio, PG_writeback);
}
}
-EXPORT_SYMBOL_GPL(wait_on_page_writeback);
+EXPORT_SYMBOL_GPL(folio_wait_writeback);
-/*
- * Wait for a page to complete writeback. Returns -EINTR if we get a
- * fatal signal while waiting.
+/**
+ * folio_wait_writeback_killable - Wait for a folio to finish writeback.
+ * @folio: The folio to wait for.
+ *
+ * If the folio is currently being written back to storage, wait for the
+ * I/O to complete or a fatal signal to arrive.
+ *
+ * Context: Sleeps. Must be called in process context and with
+ * no spinlocks held. Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
+ * Return: 0 on success, -EINTR if we get a fatal signal while waiting.
*/
-int wait_on_page_writeback_killable(struct page *page)
+int folio_wait_writeback_killable(struct folio *folio)
{
- while (PageWriteback(page)) {
- trace_wait_on_page_writeback(page, page_mapping(page));
- if (wait_on_page_bit_killable(page, PG_writeback))
+ while (folio_test_writeback(folio)) {
+ trace_folio_wait_writeback(folio, folio_mapping(folio));
+ if (folio_wait_bit_killable(folio, PG_writeback))
return -EINTR;
}
return 0;
}
-EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
+EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
/**
- * wait_for_stable_page() - wait for writeback to finish, if necessary.
- * @page: The page to wait on.
+ * folio_wait_stable() - wait for writeback to finish, if necessary.
+ * @folio: The folio to wait on.
+ *
+ * This function determines if the given folio is related to a backing
+ * device that requires folio contents to be held stable during writeback.
+ * If so, then it will wait for any pending writeback to complete.
*
- * This function determines if the given page is related to a backing device
- * that requires page contents to be held stable during writeback. If so, then
- * it will wait for any pending writeback to complete.
+ * Context: Sleeps. Must be called in process context and with
+ * no spinlocks held. Caller should hold a reference on the folio.
+ * If the folio is not locked, writeback may start again after writeback
+ * has finished.
*/
-void wait_for_stable_page(struct page *page)
+void folio_wait_stable(struct folio *folio)
{
- page = thp_head(page);
- if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
- wait_on_page_writeback(page);
+ if (folio->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
+ folio_wait_writeback(folio);
}
-EXPORT_SYMBOL_GPL(wait_for_stable_page);
+EXPORT_SYMBOL_GPL(folio_wait_stable);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b37435c274cf..c5952749ad40 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -677,10 +677,8 @@ static inline int pindex_to_order(unsigned int pindex)
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (order > PAGE_ALLOC_COSTLY_ORDER) {
+ if (order > PAGE_ALLOC_COSTLY_ORDER)
order = pageblock_order;
- VM_BUG_ON(order != pageblock_order);
- }
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
@@ -724,7 +722,7 @@ static inline void free_the_page(struct page *page, unsigned int order)
void free_compound_page(struct page *page)
{
- mem_cgroup_uncharge(page);
+ mem_cgroup_uncharge(page_folio(page));
free_the_page(page, compound_order(page));
}
@@ -1312,8 +1310,10 @@ static __always_inline bool free_pages_prepare(struct page *page,
VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
- if (compound)
+ if (compound) {
ClearPageDoubleMap(page);
+ ClearPageHasHWPoisoned(page);
+ }
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
@@ -1428,14 +1428,8 @@ static inline void prefetch_buddy(struct page *page)
/*
* Frees a number of pages from the PCP lists
- * Assumes all pages on list are in same zone, and of same order.
+ * Assumes all pages on list are in same zone.
* count is the number of pages to free.
- *
- * If the zone was previously in an "all pages pinned" state then look to
- * see if this freeing clears that state.
- *
- * And clear the zone's pages_scanned counter, to hold off the "all pages are
- * pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
@@ -1589,7 +1583,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
struct zone *zone = &pgdat->node_zones[zid];
- if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+ if (zone_spans_pfn(zone, pfn))
break;
}
__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
@@ -3147,9 +3141,9 @@ static void drain_local_pages_wq(struct work_struct *work)
* cpu which is alright but we also have to make sure to not move to
* a different one.
*/
- preempt_disable();
+ migrate_disable();
drain_local_pages(drain->zone);
- preempt_enable();
+ migrate_enable();
}
/*
@@ -3966,6 +3960,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
}
#ifdef CONFIG_NUMA
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
@@ -4795,30 +4791,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
- /*
- * If we didn't make any progress and have a lot of
- * dirty + writeback pages then we should wait for
- * an IO to complete to slow down the reclaim and
- * prevent from pre mature OOM
- */
- if (!did_some_progress) {
- unsigned long write_pending;
-
- write_pending = zone_page_state_snapshot(zone,
- NR_ZONE_WRITE_PENDING);
-
- if (2 * write_pending > reclaimable) {
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- return true;
- }
- }
-
ret = true;
- goto out;
+ break;
}
}
-out:
/*
* Memory allocation/reclaim might be called from a WQ context and the
* current implementation of the WQ concurrency control doesn't
@@ -4914,6 +4891,19 @@ retry_cpuset:
if (!ac->preferred_zoneref->zone)
goto nopage;
+ /*
+ * Check for insane configurations where the cpuset doesn't contain
+ * any suitable zone to satisfy the request - e.g. non-movable
+ * GFP_HIGHUSER allocations from MOVABLE nodes only.
+ */
+ if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
+ struct zoneref *z = first_zones_zonelist(ac->zonelist,
+ ac->highest_zoneidx,
+ &cpuset_current_mems_allowed);
+ if (!z->zone)
+ goto nopage;
+ }
+
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@@ -5223,6 +5213,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
if (unlikely(page_array && nr_pages - nr_populated == 0))
goto out;
+ /* Bulk allocator does not support memcg accounting. */
+ if (memcg_kmem_enabled() && (gfp & __GFP_ACCOUNT))
+ goto failed;
+
/* Use the single page allocator for one page. */
if (nr_pages - nr_populated == 1)
goto failed;
@@ -5400,6 +5394,18 @@ out:
}
EXPORT_SYMBOL(__alloc_pages);
+struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
+ nodemask_t *nodemask)
+{
+ struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
+ preferred_nid, nodemask);
+
+ if (page && order > 1)
+ prep_transhuge_page(page);
+ return (struct folio *)page;
+}
+EXPORT_SYMBOL(__folio_alloc);
+
/*
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
* address cannot represent highmem pages. Use alloc_pages and then kmap if
@@ -5612,8 +5618,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
unsigned long addr;
- if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
- gfp_mask &= ~__GFP_COMP;
+ if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
+ gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
@@ -5637,8 +5643,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
struct page *p;
- if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
- gfp_mask &= ~__GFP_COMP;
+ if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
+ gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
@@ -5980,6 +5986,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
printk(KERN_CONT
"%s"
" free:%lukB"
+ " boost:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
@@ -6000,6 +6007,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(zone->watermark_boost),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
@@ -6255,7 +6263,7 @@ static void build_zonelists(pg_data_t *pgdat)
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
- node_load[node] = load;
+ node_load[node] += load;
node_order[nr_nodes++] = node;
prev_node = node;
@@ -6264,6 +6272,10 @@ static void build_zonelists(pg_data_t *pgdat)
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
+ pr_info("Fallback order for Node %d: ", local_node);
+ for (node = 0; node < nr_nodes; node++)
+ pr_cont("%d ", node_order[node]);
+ pr_cont("\n");
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@@ -7389,6 +7401,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
+ int i;
+
pgdat_resize_init(pgdat);
pgdat_init_split_queue(pgdat);
@@ -7397,6 +7411,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
+ for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
+ init_waitqueue_head(&pgdat->reclaim_wait[i]);
+
pgdat_page_ext_init(pgdat);
lruvec_init(&pgdat->__lruvec);
}
@@ -8126,8 +8143,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
}
if (pages && s)
- pr_info("Freeing %s memory: %ldK\n",
- s, pages << (PAGE_SHIFT - 10));
+ pr_info("Freeing %s memory: %ldK\n", s, K(pages));
return pages;
}
@@ -8172,14 +8188,13 @@ void __init mem_init_print_info(void)
", %luK highmem"
#endif
")\n",
- nr_free_pages() << (PAGE_SHIFT - 10),
- physpages << (PAGE_SHIFT - 10),
+ K(nr_free_pages()), K(physpages),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
- totalcma_pages << (PAGE_SHIFT - 10)
+ K(physpages - totalram_pages() - totalcma_pages),
+ K(totalcma_pages)
#ifdef CONFIG_HIGHMEM
- , totalhigh_pages() << (PAGE_SHIFT - 10)
+ , K(totalhigh_pages())
#endif
);
}
@@ -8452,7 +8467,7 @@ void setup_per_zone_wmarks(void)
* 8192MB: 11584k
* 16384MB: 16384k
*/
-int __meminit init_per_zone_wmark_min(void)
+void calculate_min_free_kbytes(void)
{
unsigned long lowmem_kbytes;
int new_min_free_kbytes;
@@ -8460,16 +8475,17 @@ int __meminit init_per_zone_wmark_min(void)
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
- if (new_min_free_kbytes > user_min_free_kbytes) {
- min_free_kbytes = new_min_free_kbytes;
- if (min_free_kbytes < 128)
- min_free_kbytes = 128;
- if (min_free_kbytes > 262144)
- min_free_kbytes = 262144;
- } else {
+ if (new_min_free_kbytes > user_min_free_kbytes)
+ min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
+ else
pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
new_min_free_kbytes, user_min_free_kbytes);
- }
+
+}
+
+int __meminit init_per_zone_wmark_min(void)
+{
+ calculate_min_free_kbytes();
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
@@ -8756,7 +8772,8 @@ void *__init alloc_large_system_hash(const char *tablename,
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
- huge = is_vm_area_hugepages(table);
+ if (table)
+ huge = is_vm_area_hugepages(table);
} else {
/*
* If bucketsize is not a power-of-two, we may free
@@ -9353,21 +9370,21 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
+/*
+ * This function returns a stable result only if called under zone lock.
+ */
bool is_free_buddy_page(struct page *page)
{
- struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
- unsigned long flags;
unsigned int order;
- spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
- if (PageBuddy(page_head) && buddy_order(page_head) >= order)
+ if (PageBuddy(page_head) &&
+ buddy_order_unsafe(page_head) >= order)
break;
}
- spin_unlock_irqrestore(&zone->lock, flags);
return order < MAX_ORDER;
}
diff --git a/mm/page_ext.c b/mm/page_ext.c
index dfb91653d359..6242afb24d84 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -201,7 +201,7 @@ fail:
panic("Out of memory");
}
-#else /* CONFIG_FLATMEM */
+#else /* CONFIG_SPARSEMEM */
struct page_ext *lookup_page_ext(const struct page *page)
{
@@ -269,7 +269,7 @@ static int __meminit init_section_page_ext(unsigned long pfn, int nid)
total_usage += table_size;
return 0;
}
-#ifdef CONFIG_MEMORY_HOTPLUG
+
static void free_page_ext(void *addr)
{
if (is_vmalloc_addr(addr)) {
@@ -374,8 +374,6 @@ static int __meminit page_ext_callback(struct notifier_block *self,
return notifier_from_errno(ret);
}
-#endif
-
void __init page_ext_init(void)
{
unsigned long pfn;
diff --git a/mm/page_io.c b/mm/page_io.c
index c493ce9ebcf5..9725c7e1eeea 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -38,7 +38,7 @@ void end_swap_bio_write(struct bio *bio)
* Also print a dire warning that things will go BAD (tm)
* very quickly.
*
- * Also clear PG_reclaim to avoid rotate_reclaimable_page()
+ * Also clear PG_reclaim to avoid folio_rotate_reclaimable()
*/
set_page_dirty(page);
pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
@@ -317,7 +317,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
* temporary failure if the system has limited
* memory for allocating transmit buffers.
* Mark the page dirty and avoid
- * rotate_reclaimable_page but rate-limit the
+ * folio_rotate_reclaimable but rate-limit the
* messages but do not flag PageError like
* the normal direct-to-bio case as it could
* be temporary.
@@ -358,8 +358,6 @@ int swap_readpage(struct page *page, bool synchronous)
struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
- blk_qc_t qc;
- struct gendisk *disk;
unsigned long pflags;
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
@@ -409,26 +407,24 @@ int swap_readpage(struct page *page, bool synchronous)
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_read;
bio_add_page(bio, page, thp_size(page), 0);
-
- disk = bio->bi_bdev->bd_disk;
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
if (synchronous) {
- bio->bi_opf |= REQ_HIPRI;
+ bio->bi_opf |= REQ_POLLED;
get_task_struct(current);
bio->bi_private = current;
}
count_vm_event(PSWPIN);
bio_get(bio);
- qc = submit_bio(bio);
+ submit_bio(bio);
while (synchronous) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio->bi_private))
break;
- if (!blk_poll(disk->queue, qc, true))
+ if (!bio_poll(bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index a95c2c6562d0..f67c4c70f17f 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -94,8 +94,13 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
buddy = page + (buddy_pfn - pfn);
if (!is_migrate_isolate_page(buddy)) {
- __isolate_free_page(page, order);
- isolated_page = true;
+ isolated_page = !!__isolate_free_page(page, order);
+ /*
+ * Isolating a free page in an isolated pageblock
+ * is expected to always work as watermarks don't
+ * apply here.
+ */
+ VM_WARN_ON(!isolated_page);
}
}
}
@@ -183,7 +188,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype, int flags)
{
unsigned long pfn;
- unsigned long undo_pfn;
struct page *page;
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
@@ -193,25 +197,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page) {
- if (set_migratetype_isolate(page, migratetype, flags)) {
- undo_pfn = pfn;
- goto undo;
- }
+ if (page && set_migratetype_isolate(page, migratetype, flags)) {
+ undo_isolate_page_range(start_pfn, pfn, migratetype);
+ return -EBUSY;
}
}
return 0;
-undo:
- for (pfn = start_pfn;
- pfn < undo_pfn;
- pfn += pageblock_nr_pages) {
- struct page *page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- unset_migratetype_isolate(page, migratetype);
- }
-
- return -EBUSY;
}
/*
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 62402d22539b..79936db59859 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -210,10 +210,10 @@ void __split_page_owner(struct page *page, unsigned int nr)
}
}
-void __copy_page_owner(struct page *oldpage, struct page *newpage)
+void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
- struct page_ext *old_ext = lookup_page_ext(oldpage);
- struct page_ext *new_ext = lookup_page_ext(newpage);
+ struct page_ext *old_ext = lookup_page_ext(&old->page);
+ struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
struct page_owner *old_page_owner, *new_page_owner;
if (unlikely(!old_ext || !new_ext))
@@ -231,11 +231,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
/*
- * We don't clear the bit on the oldpage as it's going to be freed
+ * We don't clear the bit on the old folio as it's going to be freed
* after migration. Until then, the info can be useful in case of
* a bug, and the overall stats will be off a bit only temporarily.
* Also, migrate_misplaced_transhuge_page() can still fail the
- * migration and then we want the oldpage to retain the info. But
+ * migration and then we want the old folio to retain the info. But
* in that case we also don't need to explicitly clear the info from
* the new page, which will be freed.
*/
@@ -329,8 +329,6 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
depot_stack_handle_t handle)
{
int ret, pageblock_mt, page_mt;
- unsigned long *entries;
- unsigned int nr_entries;
char *kbuf;
count = min_t(size_t, count, PAGE_SIZE);
@@ -351,18 +349,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfp_migratetype(page_owner->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
- "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
+ "PFN %lu type %s Block %lu type %s Flags %pGp\n",
pfn,
migratetype_names[page_mt],
pfn >> pageblock_order,
migratetype_names[pageblock_mt],
- page->flags, &page->flags);
+ &page->flags);
if (ret >= count)
goto err;
- nr_entries = stack_depot_fetch(handle, &entries);
- ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
+ ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
goto err;
@@ -394,8 +391,6 @@ void __dump_page_owner(const struct page *page)
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
depot_stack_handle_t handle;
- unsigned long *entries;
- unsigned int nr_entries;
gfp_t gfp_mask;
int mt;
@@ -423,20 +418,17 @@ void __dump_page_owner(const struct page *page)
page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
- if (!handle) {
+ if (!handle)
pr_alert("page_owner allocation stack trace missing\n");
- } else {
- nr_entries = stack_depot_fetch(handle, &entries);
- stack_trace_print(entries, nr_entries, 0);
- }
+ else
+ stack_depot_print(handle);
handle = READ_ONCE(page_owner->free_handle);
if (!handle) {
pr_alert("page_owner free stack trace missing\n");
} else {
- nr_entries = stack_depot_fetch(handle, &entries);
pr_alert("page last free stack trace:\n");
- stack_trace_print(entries, nr_entries, 0);
+ stack_depot_print(handle);
}
if (page_owner->last_migrate_reason != -1)
diff --git a/mm/percpu.c b/mm/percpu.c
index e0a986818903..f5b2c2ea5a54 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2472,7 +2472,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
- memblock_free_early(__pa(ai), ai->__ai_size);
+ memblock_free(ai, ai->__ai_size);
}
/**
@@ -3134,7 +3134,7 @@ out_free_areas:
out_free:
pcpu_free_alloc_info(ai);
if (areas)
- memblock_free_early(__pa(areas), areas_size);
+ memblock_free(areas, areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
@@ -3256,7 +3256,7 @@ enomem:
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
- memblock_free_early(__pa(pages), pages_size);
+ memblock_free(pages, pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@@ -3286,7 +3286,7 @@ static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
- memblock_free_early(__pa(ptr), size);
+ memblock_free(ptr, size);
}
void __init setup_per_cpu_areas(void)
diff --git a/mm/readahead.c b/mm/readahead.c
index 41b75d76d36e..6ae5693de28c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -12,7 +12,6 @@
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
-#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
@@ -309,7 +308,7 @@ void force_page_cache_ra(struct readahead_control *ractl,
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
* for 128k (32 page) max ra
- * 1-8 page = 32k initial, > 8 page = 128k initial
+ * 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial
*/
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
diff --git a/mm/rmap.c b/mm/rmap.c
index 6aebd1747251..163ac4e6bcee 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -34,7 +34,7 @@
* mapping->private_lock (in __set_page_dirty_buffers)
* lock_page_memcg move_lock (in __set_page_dirty_buffers)
* i_pages lock (widely used)
- * lruvec->lru_lock (in lock_page_lruvec_irq)
+ * lruvec->lru_lock (in folio_lruvec_lock_irq)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
@@ -981,7 +981,7 @@ static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
return true;
}
-int page_mkclean(struct page *page)
+int folio_mkclean(struct folio *folio)
{
int cleaned = 0;
struct address_space *mapping;
@@ -991,20 +991,20 @@ int page_mkclean(struct page *page)
.invalid_vma = invalid_mkclean_vma,
};
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
- if (!page_mapped(page))
+ if (!folio_mapped(folio))
return 0;
- mapping = page_mapping(page);
+ mapping = folio_mapping(folio);
if (!mapping)
return 0;
- rmap_walk(page, &rwc);
+ rmap_walk(&folio->page, &rwc);
return cleaned;
}
-EXPORT_SYMBOL_GPL(page_mkclean);
+EXPORT_SYMBOL_GPL(folio_mkclean);
/**
* page_move_anon_rmap - move a page to our anon_vma
@@ -1807,6 +1807,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
update_hiwater_rss(mm);
if (is_zone_device_page(page)) {
+ unsigned long pfn = page_to_pfn(page);
swp_entry_t entry;
pte_t swp_pte;
@@ -1815,8 +1816,11 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
- entry = make_readable_migration_entry(
- page_to_pfn(page));
+ entry = pte_to_swp_entry(pteval);
+ if (is_writable_device_private_entry(entry))
+ entry = make_writable_migration_entry(pfn);
+ else
+ entry = make_readable_migration_entry(pfn);
swp_pte = swp_entry_to_pte(entry);
/*
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 1fea68b8d5a6..22b310adb53d 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -18,7 +18,6 @@
#include <linux/secretmem.h>
#include <linux/set_memory.h>
#include <linux/sched/signal.h>
-#include <linux/refcount.h>
#include <uapi/linux/magic.h>
@@ -41,11 +40,11 @@ module_param_named(enable, secretmem_enable, bool, 0400);
MODULE_PARM_DESC(secretmem_enable,
"Enable secretmem and memfd_secret(2) system call");
-static refcount_t secretmem_users;
+static atomic_t secretmem_users;
bool secretmem_active(void)
{
- return !!refcount_read(&secretmem_users);
+ return !!atomic_read(&secretmem_users);
}
static vm_fault_t secretmem_fault(struct vm_fault *vmf)
@@ -104,7 +103,7 @@ static const struct vm_operations_struct secretmem_vm_ops = {
static int secretmem_release(struct inode *inode, struct file *file)
{
- refcount_dec(&secretmem_users);
+ atomic_dec(&secretmem_users);
return 0;
}
@@ -204,6 +203,8 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
return -EINVAL;
+ if (atomic_read(&secretmem_users) < 0)
+ return -ENFILE;
fd = get_unused_fd_flags(flags & O_CLOEXEC);
if (fd < 0)
@@ -217,8 +218,8 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
file->f_flags |= O_LARGEFILE;
+ atomic_inc(&secretmem_users);
fd_install(fd, file);
- refcount_inc(&secretmem_users);
return fd;
err_put_fd:
diff --git a/mm/shmem.c b/mm/shmem.c
index b5860f4a2738..23c91a8beb78 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -59,7 +59,6 @@ static struct vfsmount *shm_mnt;
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
-#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
@@ -710,7 +709,7 @@ static int shmem_add_to_page_cache(struct page *page,
page->index = index;
if (!PageSwapCache(page)) {
- error = mem_cgroup_charge(page, charge_mm, gfp);
+ error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
if (error) {
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_FALLBACK);
@@ -856,9 +855,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
return swapped << PAGE_SHIFT;
/* Here comes the more involved part */
- return shmem_partial_swap_usage(mapping,
- linear_page_index(vma, vma->vm_start),
- linear_page_index(vma, vma->vm_end));
+ return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
+ vma->vm_pgoff + vma_pages(vma));
}
/*
@@ -1637,6 +1635,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct page *oldpage, *newpage;
+ struct folio *old, *new;
struct address_space *swap_mapping;
swp_entry_t entry;
pgoff_t swap_index;
@@ -1673,7 +1672,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
xa_lock_irq(&swap_mapping->i_pages);
error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
if (!error) {
- mem_cgroup_migrate(oldpage, newpage);
+ old = page_folio(oldpage);
+ new = page_folio(newpage);
+ mem_cgroup_migrate(old, new);
__inc_lruvec_page_state(newpage, NR_FILE_PAGES);
__dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
}
@@ -2424,7 +2425,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
- SetPageDirty(page);
unlock_page(page);
return 0;
out_delete_from_cache:
@@ -2456,6 +2456,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
+ int ret = 0;
/* i_rwsem is held by caller */
if (unlikely(info->seals & (F_SEAL_GROW |
@@ -2466,7 +2467,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- return shmem_getpage(inode, index, pagep, SGP_WRITE);
+ ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
+
+ if (*pagep && PageHWPoison(*pagep)) {
+ unlock_page(*pagep);
+ put_page(*pagep);
+ ret = -EIO;
+ }
+
+ return ret;
}
static int
@@ -2553,6 +2562,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (sgp == SGP_CACHE)
set_page_dirty(page);
unlock_page(page);
+
+ if (PageHWPoison(page)) {
+ put_page(page);
+ error = -EIO;
+ break;
+ }
}
/*
@@ -3114,7 +3129,8 @@ static const char *shmem_get_link(struct dentry *dentry,
page = find_get_page(inode->i_mapping, 0);
if (!page)
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
+ if (PageHWPoison(page) ||
+ !PageUptodate(page)) {
put_page(page);
return ERR_PTR(-ECHILD);
}
@@ -3122,6 +3138,11 @@ static const char *shmem_get_link(struct dentry *dentry,
error = shmem_getpage(inode, 0, &page, SGP_READ);
if (error)
return ERR_PTR(error);
+ if (page && PageHWPoison(page)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-ECHILD);
+ }
unlock_page(page);
}
set_delayed_call(done, shmem_put_link, page);
@@ -3772,6 +3793,13 @@ static void shmem_destroy_inodecache(void)
kmem_cache_destroy(shmem_inode_cachep);
}
+/* Keep the page in page cache instead of truncating it */
+static int shmem_error_remove_page(struct address_space *mapping,
+ struct page *page)
+{
+ return 0;
+}
+
const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
@@ -3782,7 +3810,7 @@ const struct address_space_operations shmem_aops = {
#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_page = shmem_error_remove_page,
};
EXPORT_SYMBOL(shmem_aops);
@@ -4193,6 +4221,10 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
page = ERR_PTR(error);
else
unlock_page(page);
+
+ if (PageHWPoison(page))
+ page = ERR_PTR(-EIO);
+
return page;
#else
/*
diff --git a/mm/slab.c b/mm/slab.c
index d0f725637663..da132a9ae6f8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1095,7 +1095,7 @@ static int slab_offline_cpu(unsigned int cpu)
return 0;
}
-#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+#if defined(CONFIG_NUMA)
/*
* Drains freelist for a node on each slab cache, used for memory hot-remove.
* Returns -EBUSY if all objects cannot be drained so that the node is not
@@ -1157,7 +1157,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
out:
return notifier_from_errno(ret);
}
-#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
+#endif /* CONFIG_NUMA */
/*
* swap the static kmem_cache_node with kmalloced memory
@@ -3900,8 +3900,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (err)
goto end;
- if (limit && shared && batchcount)
- goto skip_setup;
/*
* The head array serves three purposes:
* - create a LIFO ordering, i.e. return objects that are cache-warm
@@ -3944,7 +3942,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
limit = 32;
#endif
batchcount = (limit + 1) / 2;
-skip_setup:
err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
end:
if (err)
@@ -4207,19 +4204,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
n <= cachep->useroffset - offset + cachep->usersize)
return;
- /*
- * If the copy is still within the allocated object, produce
- * a warning instead of rejecting the copy. This is intended
- * to be a temporary method to find any missing usercopy
- * whitelists.
- */
- if (usercopy_fallback &&
- offset <= cachep->object_size &&
- n <= cachep->object_size - offset) {
- usercopy_warn("SLAB object", cachep->name, to_user, offset, n);
- return;
- }
-
usercopy_abort("SLAB object", cachep->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ec2bb0beed75..e5d080a93009 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,14 +37,6 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
-#ifdef CONFIG_HARDENED_USERCOPY
-bool usercopy_fallback __ro_after_init =
- IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
-module_param(usercopy_fallback, bool, 0400);
-MODULE_PARM_DESC(usercopy_fallback,
- "WARN instead of reject usercopy whitelist violations");
-#endif
-
static LIST_HEAD(slab_caches_to_rcu_destroy);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
diff --git a/mm/slub.c b/mm/slub.c
index 3d2025f7163b..f7368bfffb7a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -354,7 +354,7 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
- prefetch(object + s->offset);
+ prefetchw(object + s->offset);
}
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
@@ -414,6 +414,29 @@ static inline unsigned int oo_objects(struct kmem_cache_order_objects x)
return x.x & OO_MASK;
}
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
+{
+ unsigned int nr_pages;
+
+ s->cpu_partial = nr_objects;
+
+ /*
+ * We take the number of objects but actually limit the number of
+ * pages on the per cpu partial list, in order to limit excessive
+ * growth of the list. For simplicity we assume that the pages will
+ * be half-full.
+ */
+ nr_pages = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
+ s->cpu_partial_pages = nr_pages;
+}
+#else
+static inline void
+slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
+{
+}
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
+
/*
* Per slab locking using the pagelock
*/
@@ -763,9 +786,9 @@ void print_tracking(struct kmem_cache *s, void *object)
static void print_page_info(struct page *page)
{
- pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%#lx(%pGp)\n",
+ pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n",
page, page->objects, page->inuse, page->freelist,
- page->flags, &page->flags);
+ &page->flags);
}
@@ -1701,7 +1724,8 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s,
}
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
- void **head, void **tail)
+ void **head, void **tail,
+ int *cnt)
{
void *object;
@@ -1728,6 +1752,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
*head = object;
if (!*tail)
*tail = object;
+ } else {
+ /*
+ * Adjust the reconstructed freelist depth
+ * accordingly if object's reuse is delayed.
+ */
+ --(*cnt);
}
} while (object != old_tail);
@@ -2045,7 +2075,7 @@ static inline void remove_partial(struct kmem_cache_node *n,
*/
static inline void *acquire_slab(struct kmem_cache *s,
struct kmem_cache_node *n, struct page *page,
- int mode, int *objects)
+ int mode)
{
void *freelist;
unsigned long counters;
@@ -2061,7 +2091,6 @@ static inline void *acquire_slab(struct kmem_cache *s,
freelist = page->freelist;
counters = page->counters;
new.counters = counters;
- *objects = new.objects - new.inuse;
if (mode) {
new.inuse = page->objects;
new.freelist = NULL;
@@ -2099,9 +2128,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
{
struct page *page, *page2;
void *object = NULL;
- unsigned int available = 0;
unsigned long flags;
- int objects;
+ unsigned int partial_pages = 0;
/*
* Racy check. If we mistakenly see no partial slabs then we
@@ -2119,11 +2147,10 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
if (!pfmemalloc_match(page, gfpflags))
continue;
- t = acquire_slab(s, n, page, object == NULL, &objects);
+ t = acquire_slab(s, n, page, object == NULL);
if (!t)
break;
- available += objects;
if (!object) {
*ret_page = page;
stat(s, ALLOC_FROM_PARTIAL);
@@ -2131,10 +2158,15 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
} else {
put_cpu_partial(s, page, 0);
stat(s, CPU_PARTIAL_NODE);
+ partial_pages++;
}
+#ifdef CONFIG_SLUB_CPU_PARTIAL
if (!kmem_cache_has_cpu_partial(s)
- || available > slub_cpu_partial(s) / 2)
+ || partial_pages > s->cpu_partial_pages / 2)
break;
+#else
+ break;
+#endif
}
spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2539,14 +2571,13 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
struct page *page_to_unfreeze = NULL;
unsigned long flags;
int pages = 0;
- int pobjects = 0;
local_lock_irqsave(&s->cpu_slab->lock, flags);
oldpage = this_cpu_read(s->cpu_slab->partial);
if (oldpage) {
- if (drain && oldpage->pobjects > slub_cpu_partial(s)) {
+ if (drain && oldpage->pages >= s->cpu_partial_pages) {
/*
* Partial array is full. Move the existing set to the
* per node partial list. Postpone the actual unfreezing
@@ -2555,16 +2586,13 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
page_to_unfreeze = oldpage;
oldpage = NULL;
} else {
- pobjects = oldpage->pobjects;
pages = oldpage->pages;
}
}
pages++;
- pobjects += page->objects - page->inuse;
page->pages = pages;
- page->pobjects = pobjects;
page->next = oldpage;
this_cpu_write(s->cpu_slab->partial, page);
@@ -3413,7 +3441,9 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long tid;
- memcg_slab_free_hook(s, &head, 1);
+ /* memcg_slab_free_hook() is already called for bulk free. */
+ if (!tail)
+ memcg_slab_free_hook(s, &head, 1);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3480,7 +3510,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page,
* With KASAN enabled slab_free_freelist_hook modifies the freelist
* to remove objects, whose reuse must be delayed.
*/
- if (slab_free_freelist_hook(s, &head, &tail))
+ if (slab_free_freelist_hook(s, &head, &tail, &cnt))
do_slab_free(s, page, head, tail, cnt, addr);
}
@@ -3513,7 +3543,9 @@ static inline void free_nonslab_page(struct page *page, void *object)
{
unsigned int order = compound_order(page);
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ if (WARN_ON_ONCE(!PageCompound(page)))
+ pr_warn_once("object pointer: 0x%p\n", object);
+
kfree_hook(object);
mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
__free_pages(page, order);
@@ -3980,6 +4012,8 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
static void set_cpu_partial(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_CPU_PARTIAL
+ unsigned int nr_objects;
+
/*
* cpu_partial determined the maximum number of objects kept in the
* per cpu partial lists of a processor.
@@ -3989,24 +4023,22 @@ static void set_cpu_partial(struct kmem_cache *s)
* filled up again with minimal effort. The slab will never hit the
* per node partial lists and therefore no locking will be required.
*
- * This setting also determines
- *
- * A) The number of objects from per cpu partial slabs dumped to the
- * per node list when we reach the limit.
- * B) The number of objects in cpu partial slabs to extract from the
- * per node list when we run out of per cpu objects. We only fetch
- * 50% to keep some capacity around for frees.
+ * For backwards compatibility reasons, this is determined as number
+ * of objects, even though we now limit maximum number of pages, see
+ * slub_set_cpu_partial()
*/
if (!kmem_cache_has_cpu_partial(s))
- slub_set_cpu_partial(s, 0);
+ nr_objects = 0;
else if (s->size >= PAGE_SIZE)
- slub_set_cpu_partial(s, 2);
+ nr_objects = 6;
else if (s->size >= 1024)
- slub_set_cpu_partial(s, 6);
+ nr_objects = 24;
else if (s->size >= 256)
- slub_set_cpu_partial(s, 13);
+ nr_objects = 52;
else
- slub_set_cpu_partial(s, 30);
+ nr_objects = 120;
+
+ slub_set_cpu_partial(s, nr_objects);
#endif
}
@@ -4203,8 +4235,8 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags)
if (alloc_kmem_cache_cpus(s))
return 0;
- free_kmem_cache_nodes(s);
error:
+ __kmem_cache_release(s);
return -EINVAL;
}
@@ -4457,7 +4489,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
{
struct kmem_cache *s;
unsigned int offset;
- size_t object_size;
bool is_kfence = is_kfence_address(ptr);
ptr = kasan_reset_tag(ptr);
@@ -4490,19 +4521,6 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
n <= s->useroffset - offset + s->usersize)
return;
- /*
- * If the copy is still within the allocated object, produce
- * a warning instead of rejecting the copy. This is intended
- * to be a temporary method to find any missing usercopy
- * whitelists.
- */
- object_size = slab_ksize(s);
- if (usercopy_fallback &&
- offset <= object_size && n <= object_size - offset) {
- usercopy_warn("SLUB object", s->name, to_user, offset, n);
- return;
- }
-
usercopy_abort("SLUB object", s->name, to_user, offset, n);
}
#endif /* CONFIG_HARDENED_USERCOPY */
@@ -4880,13 +4898,15 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
return 0;
err = sysfs_slab_add(s);
- if (err)
+ if (err) {
__kmem_cache_release(s);
+ return err;
+ }
if (s->flags & SLAB_STORE_USER)
debugfs_slab_add(s);
- return err;
+ return 0;
}
void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
@@ -5379,7 +5399,12 @@ SLAB_ATTR(min_partial);
static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
- return sysfs_emit(buf, "%u\n", slub_cpu_partial(s));
+ unsigned int nr_partial = 0;
+#ifdef CONFIG_SLUB_CPU_PARTIAL
+ nr_partial = s->cpu_partial;
+#endif
+
+ return sysfs_emit(buf, "%u\n", nr_partial);
}
static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
@@ -5450,12 +5475,12 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
- if (page) {
+ if (page)
pages += page->pages;
- objects += page->pobjects;
- }
}
+ /* Approximate half-full pages , see slub_set_cpu_partial() */
+ objects = (pages * oo_objects(s->oo)) / 2;
len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages);
#ifdef CONFIG_SMP
@@ -5463,9 +5488,12 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
struct page *page;
page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
- if (page)
+ if (page) {
+ pages = READ_ONCE(page->pages);
+ objects = (pages * oo_objects(s->oo)) / 2;
len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
- cpu, page->pobjects, page->pages);
+ cpu, objects, pages);
+ }
}
#endif
len += sysfs_emit_at(buf, len, "\n");
@@ -6108,9 +6136,14 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
struct kmem_cache *s = file_inode(filep)->i_private;
unsigned long *obj_map;
+ if (!t)
+ return -ENOMEM;
+
obj_map = bitmap_alloc(oo_objects(s->oo), GFP_KERNEL);
- if (!obj_map)
+ if (!obj_map) {
+ seq_release_private(inode, filep);
return -ENOMEM;
+ }
if (strcmp(filep->f_path.dentry->d_name.name, "alloc_traces") == 0)
alloc = TRACK_ALLOC;
@@ -6119,6 +6152,7 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
if (!alloc_loc_track(t, PAGE_SIZE / sizeof(struct location), GFP_KERNEL)) {
bitmap_free(obj_map);
+ seq_release_private(inode, filep);
return -ENOMEM;
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index bdce883f9286..db6df27c852a 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -76,7 +76,7 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
set_pte_at(&init_mm, addr, pte, entry);
}
- /* Make pte visible before pmd. See comment in __pte_alloc(). */
+ /* Make pte visible before pmd. See comment in pmd_install(). */
smp_wmb();
pmd_populate_kernel(&init_mm, pmd, pgtable);
diff --git a/mm/sparse.c b/mm/sparse.c
index 120bc8ea5293..e5c84b0cf0c9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -451,7 +451,7 @@ static void *sparsemap_buf_end __meminitdata;
static inline void __meminit sparse_buffer_free(unsigned long size)
{
WARN_ON(!sparsemap_buf || size == 0);
- memblock_free_early(__pa(sparsemap_buf), size);
+ memblock_free(sparsemap_buf, size);
}
static void __init sparse_buffer_init(unsigned long size, int nid)
diff --git a/mm/swap.c b/mm/swap.c
index af3cad4e5378..1841c24682f8 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -80,10 +80,11 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
+ struct folio *folio = page_folio(page);
struct lruvec *lruvec;
unsigned long flags;
- lruvec = lock_page_lruvec_irqsave(page, &flags);
+ lruvec = folio_lruvec_lock_irqsave(folio, &flags);
del_page_from_lru_list(page, lruvec);
__clear_page_lru_flags(page);
unlock_page_lruvec_irqrestore(lruvec, flags);
@@ -94,7 +95,7 @@ static void __page_cache_release(struct page *page)
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
- mem_cgroup_uncharge(page);
+ mem_cgroup_uncharge(page_folio(page));
free_unref_page(page, 0);
}
@@ -134,18 +135,27 @@ EXPORT_SYMBOL(__put_page);
* put_pages_list() - release a list of pages
* @pages: list of pages threaded on page->lru
*
- * Release a list of pages which are strung together on page.lru. Currently
- * used by read_cache_pages() and related error recovery code.
+ * Release a list of pages which are strung together on page.lru.
*/
void put_pages_list(struct list_head *pages)
{
- while (!list_empty(pages)) {
- struct page *victim;
+ struct page *page, *next;
- victim = lru_to_page(pages);
- list_del(&victim->lru);
- put_page(victim);
+ list_for_each_entry_safe(page, next, pages, lru) {
+ if (!put_page_testzero(page)) {
+ list_del(&page->lru);
+ continue;
+ }
+ if (PageHead(page)) {
+ list_del(&page->lru);
+ __put_compound_page(page);
+ continue;
+ }
+ /* Cannot be PageLRU because it's passed to us using the lru */
+ __ClearPageWaiters(page);
}
+
+ free_unref_page_list(pages);
}
EXPORT_SYMBOL(put_pages_list);
@@ -188,12 +198,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
+ struct folio *folio = page_folio(page);
/* block memcg migration during page moving between lru */
if (!TestClearPageLRU(page))
continue;
- lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
+ lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
(*move_fn)(page, lruvec);
SetPageLRU(page);
@@ -206,11 +217,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
{
- if (!PageUnevictable(page)) {
- del_page_from_lru_list(page, lruvec);
- ClearPageActive(page);
- add_page_to_lru_list_tail(page, lruvec);
- __count_vm_events(PGROTATED, thp_nr_pages(page));
+ struct folio *folio = page_folio(page);
+
+ if (!folio_test_unevictable(folio)) {
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ lruvec_add_folio_tail(lruvec, folio);
+ __count_vm_events(PGROTATED, folio_nr_pages(folio));
}
}
@@ -227,23 +240,23 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
}
/*
- * Writeback is about to end against a page which has been marked for immediate
- * reclaim. If it still appears to be reclaimable, move it to the tail of the
- * inactive list.
+ * Writeback is about to end against a folio which has been marked for
+ * immediate reclaim. If it still appears to be reclaimable, move it
+ * to the tail of the inactive list.
*
- * rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
+ * folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
*/
-void rotate_reclaimable_page(struct page *page)
+void folio_rotate_reclaimable(struct folio *folio)
{
- if (!PageLocked(page) && !PageDirty(page) &&
- !PageUnevictable(page) && PageLRU(page)) {
+ if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
+ !folio_test_unevictable(folio) && folio_test_lru(folio)) {
struct pagevec *pvec;
unsigned long flags;
- get_page(page);
+ folio_get(folio);
local_lock_irqsave(&lru_rotate.lock, flags);
pvec = this_cpu_ptr(&lru_rotate.pvec);
- if (pagevec_add_and_need_flush(pvec, page))
+ if (pagevec_add_and_need_flush(pvec, &folio->page))
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
@@ -289,21 +302,21 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
} while ((lruvec = parent_lruvec(lruvec)));
}
-void lru_note_cost_page(struct page *page)
+void lru_note_cost_folio(struct folio *folio)
{
- lru_note_cost(mem_cgroup_page_lruvec(page),
- page_is_file_lru(page), thp_nr_pages(page));
+ lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
+ folio_nr_pages(folio));
}
-static void __activate_page(struct page *page, struct lruvec *lruvec)
+static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
{
- if (!PageActive(page) && !PageUnevictable(page)) {
- int nr_pages = thp_nr_pages(page);
+ if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ long nr_pages = folio_nr_pages(folio);
- del_page_from_lru_list(page, lruvec);
- SetPageActive(page);
- add_page_to_lru_list(page, lruvec);
- trace_mm_lru_activate(page);
+ lruvec_del_folio(lruvec, folio);
+ folio_set_active(folio);
+ lruvec_add_folio(lruvec, folio);
+ trace_mm_lru_activate(folio);
__count_vm_events(PGACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
@@ -312,6 +325,11 @@ static void __activate_page(struct page *page, struct lruvec *lruvec)
}
#ifdef CONFIG_SMP
+static void __activate_page(struct page *page, struct lruvec *lruvec)
+{
+ return __folio_activate(page_folio(page), lruvec);
+}
+
static void activate_page_drain(int cpu)
{
struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
@@ -325,16 +343,16 @@ static bool need_activate_page_drain(int cpu)
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
-static void activate_page(struct page *page)
+static void folio_activate(struct folio *folio)
{
- page = compound_head(page);
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ if (folio_test_lru(folio) && !folio_test_active(folio) &&
+ !folio_test_unevictable(folio)) {
struct pagevec *pvec;
+ folio_get(folio);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.activate_page);
- get_page(page);
- if (pagevec_add_and_need_flush(pvec, page))
+ if (pagevec_add_and_need_flush(pvec, &folio->page))
pagevec_lru_move_fn(pvec, __activate_page);
local_unlock(&lru_pvecs.lock);
}
@@ -345,21 +363,20 @@ static inline void activate_page_drain(int cpu)
{
}
-static void activate_page(struct page *page)
+static void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
- page = compound_head(page);
- if (TestClearPageLRU(page)) {
- lruvec = lock_page_lruvec_irq(page);
- __activate_page(page, lruvec);
+ if (folio_test_clear_lru(folio)) {
+ lruvec = folio_lruvec_lock_irq(folio);
+ __folio_activate(folio, lruvec);
unlock_page_lruvec_irq(lruvec);
- SetPageLRU(page);
+ folio_set_lru(folio);
}
}
#endif
-static void __lru_cache_activate_page(struct page *page)
+static void __lru_cache_activate_folio(struct folio *folio)
{
struct pagevec *pvec;
int i;
@@ -380,8 +397,8 @@ static void __lru_cache_activate_page(struct page *page)
for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
struct page *pagevec_page = pvec->pages[i];
- if (pagevec_page == page) {
- SetPageActive(page);
+ if (pagevec_page == &folio->page) {
+ folio_set_active(folio);
break;
}
}
@@ -399,61 +416,59 @@ static void __lru_cache_activate_page(struct page *page)
* When a newly allocated page is not yet visible, so safe for non-atomic ops,
* __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
*/
-void mark_page_accessed(struct page *page)
+void folio_mark_accessed(struct folio *folio)
{
- page = compound_head(page);
-
- if (!PageReferenced(page)) {
- SetPageReferenced(page);
- } else if (PageUnevictable(page)) {
+ if (!folio_test_referenced(folio)) {
+ folio_set_referenced(folio);
+ } else if (folio_test_unevictable(folio)) {
/*
* Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
* this list is never rotated or maintained, so marking an
* evictable page accessed has no effect.
*/
- } else if (!PageActive(page)) {
+ } else if (!folio_test_active(folio)) {
/*
* If the page is on the LRU, queue it for activation via
* lru_pvecs.activate_page. Otherwise, assume the page is on a
* pagevec, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
- if (PageLRU(page))
- activate_page(page);
+ if (folio_test_lru(folio))
+ folio_activate(folio);
else
- __lru_cache_activate_page(page);
- ClearPageReferenced(page);
- workingset_activation(page);
+ __lru_cache_activate_folio(folio);
+ folio_clear_referenced(folio);
+ workingset_activation(folio);
}
- if (page_is_idle(page))
- clear_page_idle(page);
+ if (folio_test_idle(folio))
+ folio_clear_idle(folio);
}
-EXPORT_SYMBOL(mark_page_accessed);
+EXPORT_SYMBOL(folio_mark_accessed);
/**
- * lru_cache_add - add a page to a page list
- * @page: the page to be added to the LRU.
+ * folio_add_lru - Add a folio to an LRU list.
+ * @folio: The folio to be added to the LRU.
*
- * Queue the page for addition to the LRU via pagevec. The decision on whether
+ * Queue the folio for addition to the LRU. The decision on whether
* to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of lru_cache_add()
- * have the page added to the active list using mark_page_accessed().
+ * pagevec is drained. This gives a chance for the caller of folio_add_lru()
+ * have the folio added to the active list using folio_mark_accessed().
*/
-void lru_cache_add(struct page *page)
+void folio_add_lru(struct folio *folio)
{
struct pagevec *pvec;
- VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
- get_page(page);
+ folio_get(folio);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
- if (pagevec_add_and_need_flush(pvec, page))
+ if (pagevec_add_and_need_flush(pvec, &folio->page))
__pagevec_lru_add(pvec);
local_unlock(&lru_pvecs.lock);
}
-EXPORT_SYMBOL(lru_cache_add);
+EXPORT_SYMBOL(folio_add_lru);
/**
* lru_cache_add_inactive_or_unevictable
@@ -888,11 +903,12 @@ void release_pages(struct page **pages, int nr)
int i;
LIST_HEAD(pages_to_free);
struct lruvec *lruvec = NULL;
- unsigned long flags;
+ unsigned long flags = 0;
unsigned int lock_batch;
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
+ struct folio *folio = page_folio(page);
/*
* Make sure the IRQ-safe lock-holding time does not get
@@ -904,7 +920,7 @@ void release_pages(struct page **pages, int nr)
lruvec = NULL;
}
- page = compound_head(page);
+ page = &folio->page;
if (is_huge_zero_page(page))
continue;
@@ -943,7 +959,7 @@ void release_pages(struct page **pages, int nr)
if (PageLRU(page)) {
struct lruvec *prev_lruvec = lruvec;
- lruvec = relock_page_lruvec_irqsave(page, lruvec,
+ lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
&flags);
if (prev_lruvec != lruvec)
lock_batch = 0;
@@ -985,17 +1001,18 @@ void __pagevec_release(struct pagevec *pvec)
}
EXPORT_SYMBOL(__pagevec_release);
-static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
+static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
{
- int was_unevictable = TestClearPageUnevictable(page);
- int nr_pages = thp_nr_pages(page);
+ int was_unevictable = folio_test_clear_unevictable(folio);
+ long nr_pages = folio_nr_pages(folio);
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
- * Page becomes evictable in two ways:
+ * A folio becomes evictable in two ways:
* 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
- * 2) Before acquiring LRU lock to put the page to correct LRU and then
+ * 2) Before acquiring LRU lock to put the folio on the correct LRU
+ * and then
* a) do PageLRU check with lock [check_move_unevictable_pages]
* b) do PageLRU check before lock [clear_page_mlock]
*
@@ -1004,35 +1021,36 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
*
* #0: __pagevec_lru_add_fn #1: clear_page_mlock
*
- * SetPageLRU() TestClearPageMlocked()
+ * folio_set_lru() folio_test_clear_mlocked()
* smp_mb() // explicit ordering // above provides strict
* // ordering
- * PageMlocked() PageLRU()
+ * folio_test_mlocked() folio_test_lru()
*
*
- * if '#1' does not observe setting of PG_lru by '#0' and fails
- * isolation, the explicit barrier will make sure that page_evictable
- * check will put the page in correct LRU. Without smp_mb(), SetPageLRU
- * can be reordered after PageMlocked check and can make '#1' to fail
- * the isolation of the page whose Mlocked bit is cleared (#0 is also
- * looking at the same page) and the evictable page will be stranded
- * in an unevictable LRU.
+ * if '#1' does not observe setting of PG_lru by '#0' and
+ * fails isolation, the explicit barrier will make sure that
+ * folio_evictable check will put the folio on the correct
+ * LRU. Without smp_mb(), folio_set_lru() can be reordered
+ * after folio_test_mlocked() check and can make '#1' fail the
+ * isolation of the folio whose mlocked bit is cleared (#0 is
+ * also looking at the same folio) and the evictable folio will
+ * be stranded on an unevictable LRU.
*/
- SetPageLRU(page);
+ folio_set_lru(folio);
smp_mb__after_atomic();
- if (page_evictable(page)) {
+ if (folio_evictable(folio)) {
if (was_unevictable)
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
- ClearPageActive(page);
- SetPageUnevictable(page);
+ folio_clear_active(folio);
+ folio_set_unevictable(folio);
if (!was_unevictable)
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
- add_page_to_lru_list(page, lruvec);
- trace_mm_lru_insertion(page);
+ lruvec_add_folio(lruvec, folio);
+ trace_mm_lru_insertion(folio);
}
/*
@@ -1046,10 +1064,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
+ struct folio *folio = page_folio(pvec->pages[i]);
- lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
- __pagevec_lru_add_fn(page, lruvec);
+ lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
+ __pagevec_lru_add_fn(folio, lruvec);
}
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index bc7cee6b2ec5..8d4104242100 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
mem_cgroup_swapin_uncharge_swap(entry);
if (shadow)
- workingset_refault(page, shadow);
+ workingset_refault(page_folio(page), shadow);
/* Caller will initiate read into locked page */
lru_cache_add(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 22d10f713848..e59e08ef46e1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -18,7 +18,7 @@
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
-#include <linux/blkdev.h>
+#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
@@ -2763,7 +2763,7 @@ static int swap_show(struct seq_file *swap, void *v)
struct swap_info_struct *si = v;
struct file *file;
int len;
- unsigned int bytes, inuse;
+ unsigned long bytes, inuse;
if (si == SEQ_START_TOKEN) {
seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
@@ -2775,7 +2775,7 @@ static int swap_show(struct seq_file *swap, void *v)
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
- seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
+ seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file_inode(file)->i_mode) ?
"partition" : "file\t",
@@ -3118,7 +3118,7 @@ static bool swap_discardable(struct swap_info_struct *si)
{
struct request_queue *q = bdev_get_queue(si->bdev);
- if (!q || !blk_queue_discard(q))
+ if (!blk_queue_discard(q))
return false;
return true;
@@ -3534,13 +3534,13 @@ struct swap_info_struct *page_swap_info(struct page *page)
}
/*
- * out-of-line __page_file_ methods to avoid include hell.
+ * out-of-line methods to avoid include hell.
*/
-struct address_space *__page_file_mapping(struct page *page)
+struct address_space *swapcache_mapping(struct folio *folio)
{
- return page_swap_info(page)->swap_file->f_mapping;
+ return page_swap_info(&folio->page)->swap_file->f_mapping;
}
-EXPORT_SYMBOL_GPL(__page_file_mapping);
+EXPORT_SYMBOL_GPL(swapcache_mapping);
pgoff_t __page_file_index(struct page *page)
{
diff --git a/mm/truncate.c b/mm/truncate.c
index 714eaf19821d..cc83a3f7c1ad 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -45,9 +45,13 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
__clear_shadow_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
}
/*
@@ -73,8 +77,10 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
return;
dax = dax_mapping(mapping);
- if (!dax)
+ if (!dax) {
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
+ }
for (i = j; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
@@ -93,8 +99,12 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
__clear_shadow_entry(mapping, index, page);
}
- if (!dax)
+ if (!dax) {
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
+ }
pvec->nr = j;
}
@@ -567,6 +577,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
return 0;
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
if (PageDirty(page))
goto failed;
@@ -574,6 +585,9 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
@@ -582,6 +596,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
return 1;
failed:
xa_unlock_irq(&mapping->i_pages);
+ spin_unlock(&mapping->host->i_lock);
return 0;
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 7a9008415534..0780c2a57ff1 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -69,10 +69,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pgoff_t offset, max_off;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+ _dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
- if (writable || !page_in_cache)
- _dst_pte = pte_mkdirty(_dst_pte);
if (writable) {
if (wp_copy)
_dst_pte = pte_mkuffd_wp(_dst_pte);
@@ -164,7 +163,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
goto out_release;
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
@@ -233,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
goto out;
}
+ if (PageHWPoison(page)) {
+ ret = -EIO;
+ goto out_release;
+ }
+
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
page, false, wp_copy);
if (ret)
diff --git a/mm/util.c b/mm/util.c
index bacabe446906..e58151a61255 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -654,81 +654,78 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
}
EXPORT_SYMBOL(kvrealloc);
-static inline void *__page_rmapping(struct page *page)
-{
- unsigned long mapping;
-
- mapping = (unsigned long)page->mapping;
- mapping &= ~PAGE_MAPPING_FLAGS;
-
- return (void *)mapping;
-}
-
/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
- page = compound_head(page);
- return __page_rmapping(page);
+ return folio_raw_mapping(page_folio(page));
}
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
+/**
+ * folio_mapped - Is this folio mapped into userspace?
+ * @folio: The folio.
+ *
+ * Return: True if any page in this folio is referenced by user page tables.
*/
-bool page_mapped(struct page *page)
+bool folio_mapped(struct folio *folio)
{
- int i;
+ long i, nr;
- if (likely(!PageCompound(page)))
- return atomic_read(&page->_mapcount) >= 0;
- page = compound_head(page);
- if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+ if (folio_test_single(folio))
+ return atomic_read(&folio->_mapcount) >= 0;
+ if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
return true;
- if (PageHuge(page))
+ if (folio_test_hugetlb(folio))
return false;
- for (i = 0; i < compound_nr(page); i++) {
- if (atomic_read(&page[i]._mapcount) >= 0)
+
+ nr = folio_nr_pages(folio);
+ for (i = 0; i < nr; i++) {
+ if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0)
return true;
}
return false;
}
-EXPORT_SYMBOL(page_mapped);
+EXPORT_SYMBOL(folio_mapped);
struct anon_vma *page_anon_vma(struct page *page)
{
- unsigned long mapping;
+ struct folio *folio = page_folio(page);
+ unsigned long mapping = (unsigned long)folio->mapping;
- page = compound_head(page);
- mapping = (unsigned long)page->mapping;
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
return NULL;
- return __page_rmapping(page);
+ return (void *)(mapping - PAGE_MAPPING_ANON);
}
-struct address_space *page_mapping(struct page *page)
+/**
+ * folio_mapping - Find the mapping where this folio is stored.
+ * @folio: The folio.
+ *
+ * For folios which are in the page cache, return the mapping that this
+ * page belongs to. Folios in the swap cache return the swap mapping
+ * this page is stored in (which is different from the mapping for the
+ * swap file or swap device where the data is stored).
+ *
+ * You can call this for folios which aren't in the swap cache or page
+ * cache and it will return NULL.
+ */
+struct address_space *folio_mapping(struct folio *folio)
{
struct address_space *mapping;
- page = compound_head(page);
-
/* This happens if someone calls flush_dcache_page on slab page */
- if (unlikely(PageSlab(page)))
+ if (unlikely(folio_test_slab(folio)))
return NULL;
- if (unlikely(PageSwapCache(page))) {
- swp_entry_t entry;
+ if (unlikely(folio_test_swapcache(folio)))
+ return swap_address_space(folio_swap_entry(folio));
- entry.val = page_private(page);
- return swap_address_space(entry);
- }
-
- mapping = page->mapping;
+ mapping = folio->mapping;
if ((unsigned long)mapping & PAGE_MAPPING_ANON)
return NULL;
return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}
-EXPORT_SYMBOL(page_mapping);
+EXPORT_SYMBOL(folio_mapping);
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
@@ -750,13 +747,26 @@ int __page_mapcount(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_mapcount);
-void copy_huge_page(struct page *dst, struct page *src)
+/**
+ * folio_copy - Copy the contents of one folio to another.
+ * @dst: Folio to copy to.
+ * @src: Folio to copy from.
+ *
+ * The bytes in the folio represented by @src are copied to @dst.
+ * Assumes the caller has validated that @dst is at least as large as @src.
+ * Can be called in atomic context for order-0 folios, but if the folio is
+ * larger, it may sleep.
+ */
+void folio_copy(struct folio *dst, struct folio *src)
{
- unsigned i, nr = compound_nr(src);
+ long i = 0;
+ long nr = folio_nr_pages(src);
- for (i = 0; i < nr; i++) {
+ for (;;) {
+ copy_highpage(folio_page(dst, i), folio_page(src, i));
+ if (++i == nr)
+ break;
cond_resched();
- copy_highpage(nth_page(dst, i), nth_page(src, i));
}
}
@@ -1079,3 +1089,14 @@ void page_offline_end(void)
up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
+
+#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
+void flush_dcache_folio(struct folio *folio)
+{
+ long i, nr = folio_nr_pages(folio);
+
+ for (i = 0; i < nr; i++)
+ flush_dcache_page(folio_page(folio, i));
+}
+EXPORT_SYMBOL(flush_dcache_folio);
+#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d77830ff604c..d2a00ad4e1dd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1195,18 +1195,14 @@ find_vmap_lowest_match(unsigned long size,
{
struct vmap_area *va;
struct rb_node *node;
- unsigned long length;
/* Start from the root. */
node = free_vmap_area_root.rb_node;
- /* Adjust the search size for alignment overhead. */
- length = size + align - 1;
-
while (node) {
va = rb_entry(node, struct vmap_area, rb_node);
- if (get_subtree_max_size(node->rb_left) >= length &&
+ if (get_subtree_max_size(node->rb_left) >= size &&
vstart < va->va_start) {
node = node->rb_left;
} else {
@@ -1216,9 +1212,9 @@ find_vmap_lowest_match(unsigned long size,
/*
* Does not make sense to go deeper towards the right
* sub-tree if it does not have a free block that is
- * equal or bigger to the requested search length.
+ * equal or bigger to the requested search size.
*/
- if (get_subtree_max_size(node->rb_right) >= length) {
+ if (get_subtree_max_size(node->rb_right) >= size) {
node = node->rb_right;
continue;
}
@@ -1226,15 +1222,23 @@ find_vmap_lowest_match(unsigned long size,
/*
* OK. We roll back and find the first right sub-tree,
* that will satisfy the search criteria. It can happen
- * only once due to "vstart" restriction.
+ * due to "vstart" restriction or an alignment overhead
+ * that is bigger then PAGE_SIZE.
*/
while ((node = rb_parent(node))) {
va = rb_entry(node, struct vmap_area, rb_node);
if (is_within_this_va(va, size, align, vstart))
return va;
- if (get_subtree_max_size(node->rb_right) >= length &&
+ if (get_subtree_max_size(node->rb_right) >= size &&
vstart <= va->va_start) {
+ /*
+ * Shift the vstart forward. Please note, we update it with
+ * parent's start address adding "1" because we do not want
+ * to enter same sub-tree after it has already been checked
+ * and no suitable free block found there.
+ */
+ vstart = va->va_start + 1;
node = node->rb_right;
break;
}
@@ -1265,7 +1269,7 @@ find_vmap_lowest_linear_match(unsigned long size,
}
static void
-find_vmap_lowest_match_check(unsigned long size)
+find_vmap_lowest_match_check(unsigned long size, unsigned long align)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
@@ -1274,8 +1278,8 @@ find_vmap_lowest_match_check(unsigned long size)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
- va_1 = find_vmap_lowest_match(size, 1, vstart);
- va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
+ va_1 = find_vmap_lowest_match(size, align, vstart);
+ va_2 = find_vmap_lowest_linear_match(size, align, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@@ -1454,7 +1458,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
- find_vmap_lowest_match_check(size);
+ find_vmap_lowest_match_check(size, align);
#endif
return nva_start_addr;
@@ -2272,15 +2276,22 @@ void __init vm_area_add_early(struct vm_struct *vm)
*/
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
- static size_t vm_init_off __initdata;
- unsigned long addr;
+ unsigned long addr = ALIGN(VMALLOC_START, align);
+ struct vm_struct *cur, **p;
- addr = ALIGN(VMALLOC_START + vm_init_off, align);
- vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
+ BUG_ON(vmap_initialized);
- vm->addr = (void *)addr;
+ for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
+ if ((unsigned long)cur->addr - addr >= vm->size)
+ break;
+ addr = ALIGN((unsigned long)cur->addr + cur->size, align);
+ }
- vm_area_add_early(vm);
+ BUG_ON(addr > VMALLOC_END - vm->size);
+ vm->addr = (void *)addr;
+ vm->next = *p;
+ *p = vm;
+ kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
static void vmap_init_free_space(void)
@@ -2743,6 +2754,13 @@ void *vmap(struct page **pages, unsigned int count,
might_sleep();
+ /*
+ * Your top guard is someone else's bottom guard. Not having a top
+ * guard compromises someone else's mappings too.
+ */
+ if (WARN_ON_ONCE(flags & VM_NO_GUARD))
+ flags &= ~VM_NO_GUARD;
+
if (count > totalram_pages())
return NULL;
@@ -2816,6 +2834,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
+ struct page *page;
+ int i;
/*
* For order-0 pages we make use of bulk allocator, if
@@ -2835,8 +2855,20 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
*/
nr_pages_request = min(100U, nr_pages - nr_allocated);
- nr = alloc_pages_bulk_array_node(gfp, nid,
- nr_pages_request, pages + nr_allocated);
+ /* memory allocation should consider mempolicy, we can't
+ * wrongly use nearest node when nid == NUMA_NO_NODE,
+ * otherwise memory may be allocated in only one node,
+ * but mempolcy want to alloc memory by interleaving.
+ */
+ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
+ nr = alloc_pages_bulk_array_mempolicy(gfp,
+ nr_pages_request,
+ pages + nr_allocated);
+
+ else
+ nr = alloc_pages_bulk_array_node(gfp, nid,
+ nr_pages_request,
+ pages + nr_allocated);
nr_allocated += nr;
cond_resched();
@@ -2856,11 +2888,15 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
gfp |= __GFP_COMP;
/* High-order pages or fallback path if "bulk" fails. */
+
while (nr_allocated < nr_pages) {
- struct page *page;
- int i;
+ if (fatal_signal_pending(current))
+ break;
- page = alloc_pages_node(nid, gfp, order);
+ if (nid == NUMA_NO_NODE)
+ page = alloc_pages(gfp, order);
+ else
+ page = alloc_pages_node(nid, gfp, order);
if (unlikely(!page))
break;
@@ -2884,6 +2920,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+ const gfp_t orig_gfp_mask = gfp_mask;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
@@ -2904,7 +2941,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
if (!area->pages) {
- warn_alloc(gfp_mask, NULL,
+ warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
@@ -2924,7 +2961,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* allocation request, free them via __vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
- warn_alloc(gfp_mask, NULL,
+ warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
@@ -2932,7 +2969,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
if (vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift) < 0) {
- warn_alloc(gfp_mask, NULL,
+ warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
@@ -2958,8 +2995,16 @@ fail:
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * allocator with @gfp_mask flags. Please note that the full set of gfp
+ * flags are not supported. GFP_KERNEL would be a preferred allocation mode
+ * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not
+ * supported. From the reclaim modifiers__GFP_DIRECT_RECLAIM is required (aka
+ * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka
+ * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported).
+ * __GFP_NOWARN can be used to suppress error messages about failures.
+ *
+ * Map them into contiguous kernel virtual space, using a pagetable
+ * protection of @prot.
*
* Return: the address of the area or %NULL on failure
*/
@@ -3853,6 +3898,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
if (IS_ENABLED(CONFIG_NUMA)) {
unsigned int nr, *counters = m->private;
+ unsigned int step = 1U << vm_area_page_order(v);
if (!counters)
return;
@@ -3864,9 +3910,8 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
- for (nr = 0; nr < v->nr_pages; nr++)
- counters[page_to_nid(v->pages[nr])]++;
-
+ for (nr = 0; nr < v->nr_pages; nr += step)
+ counters[page_to_nid(v->pages[nr])] += step;
for_each_node_state(nr, N_HIGH_MEMORY)
if (counters[nr])
seq_printf(m, " N%u=%u", nr, counters[nr]);
@@ -3902,7 +3947,7 @@ static int s_show(struct seq_file *m, void *p)
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);
- return 0;
+ goto final;
}
v = va->vm;
@@ -3943,6 +3988,7 @@ static int s_show(struct seq_file *m, void *p)
/*
* As a final step, dump "unpurged" areas.
*/
+final:
if (list_is_last(&va->list, &vmap_area_list))
show_purge_info(m);
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 76518e4166dc..b52644771cc4 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -308,7 +308,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
* asserted for a second in which subsequent
* pressure events can occur.
*/
- memcg->socket_pressure = jiffies + HZ;
+ WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
}
}
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 74296c2d1fed..fb9584641ac7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -687,6 +687,21 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);
+/**
+ * synchronize_shrinkers - Wait for all running shrinkers to complete.
+ *
+ * This is equivalent to calling unregister_shrink() and register_shrinker(),
+ * but atomically and with less overhead. This is useful to guarantee that all
+ * shrinker invocations have seen an update, before freeing memory, similar to
+ * rcu.
+ */
+void synchronize_shrinkers(void)
+{
+ down_write(&shrinker_rwsem);
+ up_write(&shrinker_rwsem);
+}
+EXPORT_SYMBOL(synchronize_shrinkers);
+
#define SHRINK_BATCH 128
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
@@ -1006,6 +1021,91 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
+void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
+{
+ wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
+ long timeout, ret;
+ DEFINE_WAIT(wait);
+
+ /*
+ * Do not throttle IO workers, kthreads other than kswapd or
+ * workqueues. They may be required for reclaim to make
+ * forward progress (e.g. journalling workqueues or kthreads).
+ */
+ if (!current_is_kswapd() &&
+ current->flags & (PF_IO_WORKER|PF_KTHREAD))
+ return;
+
+ /*
+ * These figures are pulled out of thin air.
+ * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
+ * parallel reclaimers which is a short-lived event so the timeout is
+ * short. Failing to make progress or waiting on writeback are
+ * potentially long-lived events so use a longer timeout. This is shaky
+ * logic as a failure to make progress could be due to anything from
+ * writeback to a slow device to excessive references pages at the tail
+ * of the inactive LRU.
+ */
+ switch(reason) {
+ case VMSCAN_THROTTLE_WRITEBACK:
+ timeout = HZ/10;
+
+ if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
+ WRITE_ONCE(pgdat->nr_reclaim_start,
+ node_page_state(pgdat, NR_THROTTLED_WRITTEN));
+ }
+
+ break;
+ case VMSCAN_THROTTLE_NOPROGRESS:
+ timeout = HZ/2;
+ break;
+ case VMSCAN_THROTTLE_ISOLATED:
+ timeout = HZ/50;
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ timeout = HZ;
+ break;
+ }
+
+ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+ ret = schedule_timeout(timeout);
+ finish_wait(wqh, &wait);
+
+ if (reason == VMSCAN_THROTTLE_WRITEBACK)
+ atomic_dec(&pgdat->nr_writeback_throttled);
+
+ trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
+ jiffies_to_usecs(timeout - ret),
+ reason);
+}
+
+/*
+ * Account for pages written if tasks are throttled waiting on dirty
+ * pages to clean. If enough pages have been cleaned since throttling
+ * started then wakeup the throttled tasks.
+ */
+void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
+ int nr_throttled)
+{
+ unsigned long nr_written;
+
+ node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
+
+ /*
+ * This is an inaccurate read as the per-cpu deltas may not
+ * be synchronised. However, given that the system is
+ * writeback throttled, it is not worth taking the penalty
+ * of getting an accurate count. At worst, the throttle
+ * timeout guarantees forward progress.
+ */
+ nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
+ READ_ONCE(pgdat->nr_reclaim_start);
+
+ if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
+ wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
+}
+
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@@ -1105,6 +1205,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
+ if (!PageSwapCache(page))
+ spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
@@ -1173,6 +1275,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
if (freepage != NULL)
freepage(page);
@@ -1182,6 +1287,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
cannot_free:
xa_unlock_irq(&mapping->i_pages);
+ if (!PageSwapCache(page))
+ spin_unlock(&mapping->host->i_lock);
return 0;
}
@@ -1337,7 +1444,6 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
- int err;
if (list_empty(demote_pages))
return 0;
@@ -1346,7 +1452,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
return 0;
/* Demotion ignores all cpuset and mempolicy settings */
- err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+ migrate_pages(demote_pages, alloc_demote_page, NULL,
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@@ -1412,9 +1518,8 @@ retry:
/*
* The number of dirty pages determines if a node is marked
- * reclaim_congested which affects wait_iff_congested. kswapd
- * will stall and start writing pages if the tail of the LRU
- * is all dirty unqueued pages.
+ * reclaim_congested. kswapd will stall and start writing
+ * pages if the tail of the LRU is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
@@ -2090,6 +2195,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
*/
int isolate_lru_page(struct page *page)
{
+ struct folio *folio = page_folio(page);
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2099,7 +2205,7 @@ int isolate_lru_page(struct page *page)
struct lruvec *lruvec;
get_page(page);
- lruvec = lock_page_lruvec_irq(page);
+ lruvec = folio_lruvec_lock_irq(folio);
del_page_from_lru_list(page, lruvec);
unlock_page_lruvec_irq(lruvec);
ret = 0;
@@ -2119,6 +2225,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
+ bool too_many;
if (current_is_kswapd())
return 0;
@@ -2142,7 +2249,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
inactive >>= 3;
- return isolated > inactive;
+ too_many = isolated > inactive;
+
+ /* Wake up tasks throttled due to too_many_isolated. */
+ if (!too_many)
+ wake_throttle_isolated(pgdat);
+
+ return too_many;
}
/*
@@ -2199,7 +2312,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
* All pages were isolated from the same lruvec (and isolation
* inhibits memcg migration).
*/
- VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
+ VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page);
add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
@@ -2251,8 +2364,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return 0;
/* wait a bit for the reclaimer. */
- msleep(100);
stalled = true;
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@@ -3180,19 +3293,19 @@ again:
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
- * faster than they are written so also forcibly stall.
+ * faster than they are written so forcibly stall
+ * until some pages complete writeback.
*/
if (sc->nr.immediate)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}
/*
- * Tag a node/memcg as congested if all the dirty pages
- * scanned were backed by a congested BDI and
- * wait_iff_congested will stall.
+ * Tag a node/memcg as congested if all the dirty pages were marked
+ * for writeback and immediate reclaim (counted in nr.congested).
*
* Legacy memcg will stall in page writeback so avoid forcibly
- * stalling in wait_iff_congested().
+ * stalling in reclaim_throttle().
*/
if ((current_is_kswapd() ||
(cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
@@ -3200,15 +3313,15 @@ again:
set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
/*
- * Stall direct reclaim for IO completions if underlying BDIs
- * and node is congested. Allow kswapd to continue until it
+ * Stall direct reclaim for IO completions if the lruvec is
+ * node is congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
- wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
@@ -3256,6 +3369,36 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
+static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
+{
+ /*
+ * If reclaim is making progress greater than 12% efficiency then
+ * wake all the NOPROGRESS throttled tasks.
+ */
+ if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
+ wait_queue_head_t *wqh;
+
+ wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
+ if (waitqueue_active(wqh))
+ wake_up(wqh);
+
+ return;
+ }
+
+ /*
+ * Do not throttle kswapd on NOPROGRESS as it will throttle on
+ * VMSCAN_THROTTLE_WRITEBACK if there are too many pages under
+ * writeback and marked for immediate reclaim at the tail of
+ * the LRU.
+ */
+ if (current_is_kswapd())
+ return;
+
+ /* Throttle if making no progress at high prioities. */
+ if (sc->priority < DEF_PRIORITY - 2)
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
+}
+
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
@@ -3340,6 +3483,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
continue;
last_pgdat = zone->zone_pgdat;
shrink_node(zone->zone_pgdat, sc);
+ consider_reclaim_throttle(zone->zone_pgdat, sc);
}
/*
@@ -4286,6 +4430,7 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
+ atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
bool ret;
@@ -4665,6 +4810,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
+ struct folio *folio = page_folio(page);
int nr_pages;
if (PageTransTail(page))
@@ -4677,7 +4823,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
if (!TestClearPageLRU(page))
continue;
- lruvec = relock_page_lruvec_irq(page, lruvec);
+ lruvec = folio_lruvec_relock_irq(folio, lruvec);
if (page_evictable(page) && PageUnevictable(page)) {
del_page_from_lru_list(page, lruvec);
ClearPageUnevictable(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8ce2620344b2..d701c335628c 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -165,6 +165,34 @@ atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);
+#ifdef CONFIG_NUMA
+static void fold_vm_zone_numa_events(struct zone *zone)
+{
+ unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
+ int cpu;
+ enum numa_stat_item item;
+
+ for_each_online_cpu(cpu) {
+ struct per_cpu_zonestat *pzstats;
+
+ pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
+ }
+
+ for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
+ zone_numa_event_add(zone_numa_events[item], zone, item);
+}
+
+void fold_vm_numa_events(void)
+{
+ struct zone *zone;
+
+ for_each_populated_zone(zone)
+ fold_vm_zone_numa_events(zone);
+}
+#endif
+
#ifdef CONFIG_SMP
int calculate_pressure_threshold(struct zone *zone)
@@ -771,34 +799,6 @@ static int fold_diff(int *zone_diff, int *node_diff)
return changes;
}
-#ifdef CONFIG_NUMA
-static void fold_vm_zone_numa_events(struct zone *zone)
-{
- unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
- int cpu;
- enum numa_stat_item item;
-
- for_each_online_cpu(cpu) {
- struct per_cpu_zonestat *pzstats;
-
- pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
- for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
- zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
- }
-
- for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
- zone_numa_event_add(zone_numa_events[item], zone, item);
-}
-
-void fold_vm_numa_events(void)
-{
- struct zone *zone;
-
- for_each_populated_zone(zone)
- fold_vm_zone_numa_events(zone);
-}
-#endif
-
/*
* Update the zone counters for the current cpu.
*
@@ -1070,8 +1070,13 @@ static void fill_contig_page_info(struct zone *zone,
for (order = 0; order < MAX_ORDER; order++) {
unsigned long blocks;
- /* Count number of free blocks */
- blocks = zone->free_area[order].nr_free;
+ /*
+ * Count number of free blocks.
+ *
+ * Access to nr_free is lockless as nr_free is used only for
+ * diagnostic purposes. Use data_race to avoid KCSAN warning.
+ */
+ blocks = data_race(zone->free_area[order].nr_free);
info->free_blocks_total += blocks;
/* Count free base pages */
@@ -1225,6 +1230,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
+ "nr_throttled_written",
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",
@@ -1445,7 +1451,11 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
- seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
+ /*
+ * Access to nr_free is lockless as nr_free is used only for
+ * printing purposes. Use data_race to avoid KCSAN warning.
+ */
+ seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
seq_putc(m, '\n');
}
@@ -1656,6 +1666,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n pages free %lu"
+ "\n boost %lu"
"\n min %lu"
"\n low %lu"
"\n high %lu"
@@ -1664,6 +1675,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n managed %lu"
"\n cma %lu",
zone_page_state(zone, NR_FREE_PAGES),
+ zone->watermark_boost,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
@@ -2179,7 +2191,7 @@ static void extfrag_show_print(struct seq_file *m,
for (order = 0; order < MAX_ORDER; ++order) {
fill_contig_page_info(zone, order, &info);
index = __fragmentation_index(order, &info);
- seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
+ seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
}
seq_putc(m, '\n');
diff --git a/mm/workingset.c b/mm/workingset.c
index d5b81e4f4cbe..8c03afe1d67c 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -273,17 +273,17 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
}
/**
- * workingset_refault - evaluate the refault of a previously evicted page
- * @page: the freshly allocated replacement page
- * @shadow: shadow entry of the evicted page
+ * workingset_refault - Evaluate the refault of a previously evicted folio.
+ * @folio: The freshly allocated replacement folio.
+ * @shadow: Shadow entry of the evicted folio.
*
* Calculates and evaluates the refault distance of the previously
- * evicted page in the context of the node and the memcg whose memory
+ * evicted folio in the context of the node and the memcg whose memory
* pressure caused the eviction.
*/
-void workingset_refault(struct page *page, void *shadow)
+void workingset_refault(struct folio *folio, void *shadow)
{
- bool file = page_is_file_lru(page);
+ bool file = folio_is_file_lru(folio);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
@@ -295,16 +295,17 @@ void workingset_refault(struct page *page, void *shadow)
unsigned long refault;
bool workingset;
int memcgid;
+ long nr;
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
- * have been deleted since the page's eviction.
+ * have been deleted since the folio's eviction.
*
* Note that in rare events the ID could have been recycled
- * for a new cgroup that refaults a shared page. This is
+ * for a new cgroup that refaults a shared folio. This is
* impossible to tell from the available data. However, this
* should be a rare and limited disturbance, and activations
* are always speculative anyway. Ultimately, it's the aging
@@ -340,17 +341,18 @@ void workingset_refault(struct page *page, void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
/*
- * The activation decision for this page is made at the level
+ * The activation decision for this folio is made at the level
* where the eviction occurred, as that is where the LRU order
- * during page reclaim is being determined.
+ * during folio reclaim is being determined.
*
- * However, the cgroup that will own the page is the one that
+ * However, the cgroup that will own the folio is the one that
* is actually experiencing the refault event.
*/
- memcg = page_memcg(page);
+ nr = folio_nr_pages(folio);
+ memcg = folio_memcg(folio);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
+ mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
mem_cgroup_flush_stats();
/*
@@ -376,16 +378,16 @@ void workingset_refault(struct page *page, void *shadow)
if (refault_distance > workingset_size)
goto out;
- SetPageActive(page);
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
+ folio_set_active(folio);
+ workingset_age_nonresident(lruvec, nr);
+ mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
- /* Page was active prior to eviction */
+ /* Folio was active prior to eviction */
if (workingset) {
- SetPageWorkingset(page);
+ folio_set_workingset(folio);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
- lru_note_cost_page(page);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
+ lru_note_cost_folio(folio);
+ mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out:
rcu_read_unlock();
@@ -393,12 +395,11 @@ out:
/**
* workingset_activation - note a page activation
- * @page: page that is being activated
+ * @folio: Folio that is being activated.
*/
-void workingset_activation(struct page *page)
+void workingset_activation(struct folio *folio)
{
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
rcu_read_lock();
/*
@@ -408,11 +409,10 @@ void workingset_activation(struct page *page)
* XXX: See workingset_refault() - this should return
* root_mem_cgroup even for !CONFIG_MEMCG.
*/
- memcg = page_memcg_rcu(page);
+ memcg = folio_memcg_rcu(folio);
if (!mem_cgroup_disabled() && !memcg)
goto out;
- lruvec = mem_cgroup_page_lruvec(page);
- workingset_age_nonresident(lruvec, thp_nr_pages(page));
+ workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
rcu_read_unlock();
}
@@ -543,6 +543,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
+ if (!spin_trylock(&mapping->host->i_lock)) {
+ xa_unlock(&mapping->i_pages);
+ spin_unlock_irq(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+
list_lru_isolate(lru, item);
__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
@@ -562,6 +569,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
out_invalid:
xa_unlock_irq(&mapping->i_pages);
+ if (mapping_shrinkable(mapping))
+ inode_add_lru(mapping->host);
+ spin_unlock(&mapping->host->i_lock);
ret = LRU_REMOVED_RETRY;
out:
cond_resched();
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 68e8831068f4..b897ce3b399a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1830,10 +1830,11 @@ static inline void zs_pool_dec_isolated(struct zs_pool *pool)
VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
atomic_long_dec(&pool->isolated_pages);
/*
- * There's no possibility of racing, since wait_for_isolated_drain()
- * checks the isolated count under &class->lock after enqueuing
- * on migration_wait.
+ * Checking pool->destroying must happen after atomic_long_dec()
+ * for pool->isolated_pages above. Paired with the smp_mb() in
+ * zs_unregister_migration().
*/
+ smp_mb__after_atomic();
if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
wake_up_all(&pool->migration_wait);
}