summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig15
-rw-r--r--mm/Makefile1
-rw-r--r--mm/balloon_compaction.c10
-rw-r--r--mm/cma_debug.c2
-rw-r--r--mm/compaction.c39
-rw-r--r--mm/damon/Kconfig8
-rw-r--r--mm/damon/Makefile1
-rw-r--r--mm/damon/dbgfs.c79
-rw-r--r--mm/damon/lru_sort.c548
-rw-r--r--mm/damon/ops-common.c42
-rw-r--r--mm/damon/ops-common.h2
-rw-r--r--mm/damon/paddr.c60
-rw-r--r--mm/damon/reclaim.c50
-rw-r--r--mm/damon/sysfs.c69
-rw-r--r--mm/damon/vaddr.c3
-rw-r--r--mm/debug_vm_pgtable.c2
-rw-r--r--mm/filemap.c176
-rw-r--r--mm/folio-compat.c22
-rw-r--r--mm/frontswap.c2
-rw-r--r--mm/gup.c96
-rw-r--r--mm/gup_test.c2
-rw-r--r--mm/highmem.c4
-rw-r--r--mm/hmm.c19
-rw-r--r--mm/huge_memory.c198
-rw-r--r--mm/hugetlb.c193
-rw-r--r--mm/hugetlb_cgroup.c1
-rw-r--r--mm/hugetlb_vmemmap.c68
-rw-r--r--mm/hwpoison-inject.c2
-rw-r--r--mm/internal.h21
-rw-r--r--mm/ioremap.c26
-rw-r--r--mm/kasan/common.c11
-rw-r--r--mm/kasan/hw_tags.c32
-rw-r--r--mm/kasan/kasan.h3
-rw-r--r--mm/kasan/report.c12
-rw-r--r--mm/kasan/shadow.c29
-rw-r--r--mm/kfence/core.c29
-rw-r--r--mm/khugepaged.c230
-rw-r--r--mm/kmemleak.c260
-rw-r--r--mm/ksm.c12
-rw-r--r--mm/list_lru.c2
-rw-r--r--mm/madvise.c16
-rw-r--r--mm/memblock.c46
-rw-r--r--mm/memcontrol.c226
-rw-r--r--mm/memory-failure.c344
-rw-r--r--mm/memory.c63
-rw-r--r--mm/memory_hotplug.c57
-rw-r--r--mm/mempolicy.c10
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/memremap.c22
-rw-r--r--mm/migrate.c282
-rw-r--r--mm/migrate_device.c83
-rw-r--r--mm/mlock.c2
-rw-r--r--mm/mmap.c53
-rw-r--r--mm/mprotect.c81
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/page-writeback.c89
-rw-r--r--mm/page_alloc.c472
-rw-r--r--mm/page_isolation.c2
-rw-r--r--mm/page_vma_mapped.c5
-rw-r--r--mm/percpu.c6
-rw-r--r--mm/readahead.c2
-rw-r--r--mm/rmap.c141
-rw-r--r--mm/secretmem.c55
-rw-r--r--mm/shmem.c106
-rw-r--r--mm/shrinker_debug.c286
-rw-r--r--mm/slab.c30
-rw-r--r--mm/slab.h39
-rw-r--r--mm/slab_common.c36
-rw-r--r--mm/slob.c33
-rw-r--r--mm/slub.c141
-rw-r--r--mm/sparse-vmemmap.c20
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/swap.c630
-rw-r--r--mm/swap.h19
-rw-r--r--mm/swap_slots.c2
-rw-r--r--mm/swap_state.c60
-rw-r--r--mm/swapfile.c31
-rw-r--r--mm/truncate.c2
-rw-r--r--mm/usercopy.c2
-rw-r--r--mm/userfaultfd.c5
-rw-r--r--mm/util.c6
-rw-r--r--mm/vmalloc.c148
-rw-r--r--mm/vmscan.c388
-rw-r--r--mm/workingset.c2
-rw-r--r--mm/z3fold.c84
-rw-r--r--mm/zsmalloc.c118
86 files changed, 4146 insertions, 2486 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a5b4fee2e3fd..0331f1461f81 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -33,7 +33,7 @@ config ZSWAP
pages that are in the process of being swapped out and attempts to
compress them into a dynamically allocated RAM-based memory pool.
This can result in a significant I/O reduction on swap device and,
- in the case where decompressing from RAM is faster that swap device
+ in the case where decompressing from RAM is faster than swap device
reads, can also improve workload performance.
This is marked experimental because it is a new feature (as of
@@ -639,14 +639,6 @@ config BOUNCE
memory available to the CPU. Enabled by default when HIGHMEM is
selected, but you may say n to override this.
-config VIRT_TO_BUS
- bool
- help
- An architecture should select this if it implements the
- deprecated interface virt_to_bus(). All new architectures
- should probably not select this.
-
-
config MMU_NOTIFIER
bool
select SRCU
@@ -663,7 +655,7 @@ config KSM
the many instances by a single page with that content, so
saving memory until one or another app needs to modify the content.
Recommended for use with KVM, or with other duplicative applications.
- See Documentation/vm/ksm.rst for more information: KSM is inactive
+ See Documentation/mm/ksm.rst for more information: KSM is inactive
until a program has madvised that an area is MADV_MERGEABLE, and
root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
@@ -951,9 +943,6 @@ config ARCH_HAS_CURRENT_STACK_POINTER
register alias named "current_stack_pointer", this config can be
selected.
-config ARCH_HAS_VM_GET_PAGE_PROT
- bool
-
config ARCH_HAS_PTE_DEVMAP
bool
diff --git a/mm/Makefile b/mm/Makefile
index 6f9ffa968a1a..9a564f836403 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o
obj-$(CONFIG_IO_MAPPING) += io-mapping.o
obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
+obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index 4b8eab4b3f45..22c96fed70b5 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -228,10 +228,8 @@ static void balloon_page_putback(struct page *page)
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
}
-
/* move_to_new_page() counterpart for a ballooned page */
-static int balloon_page_migrate(struct address_space *mapping,
- struct page *newpage, struct page *page,
+static int balloon_page_migrate(struct page *newpage, struct page *page,
enum migrate_mode mode)
{
struct balloon_dev_info *balloon = balloon_page_device(page);
@@ -250,11 +248,11 @@ static int balloon_page_migrate(struct address_space *mapping,
return balloon->migratepage(balloon, newpage, page, mode);
}
-const struct address_space_operations balloon_aops = {
- .migratepage = balloon_page_migrate,
+const struct movable_operations balloon_mops = {
+ .migrate_page = balloon_page_migrate,
.isolate_page = balloon_page_isolate,
.putback_page = balloon_page_putback,
};
-EXPORT_SYMBOL_GPL(balloon_aops);
+EXPORT_SYMBOL_GPL(balloon_mops);
#endif /* CONFIG_BALLOON_COMPACTION */
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 2e7704955f4f..c3ffe253e055 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -163,7 +163,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
- char name[16];
+ char name[CMA_MAX_NAME];
scnprintf(name, sizeof(name), "cma-%s", cma->name);
diff --git a/mm/compaction.c b/mm/compaction.c
index 1f89b969c12b..640fa76228dd 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -110,28 +110,27 @@ static void split_map_pages(struct list_head *list)
}
#ifdef CONFIG_COMPACTION
-
-int PageMovable(struct page *page)
+bool PageMovable(struct page *page)
{
- struct address_space *mapping;
+ const struct movable_operations *mops;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (!__PageMovable(page))
- return 0;
+ return false;
- mapping = page_mapping(page);
- if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
- return 1;
+ mops = page_movable_ops(page);
+ if (mops)
+ return true;
- return 0;
+ return false;
}
EXPORT_SYMBOL(PageMovable);
-void __SetPageMovable(struct page *page, struct address_space *mapping)
+void __SetPageMovable(struct page *page, const struct movable_operations *mops)
{
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
- page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
+ VM_BUG_ON_PAGE((unsigned long)mops & PAGE_MAPPING_MOVABLE, page);
+ page->mapping = (void *)((unsigned long)mops | PAGE_MAPPING_MOVABLE);
}
EXPORT_SYMBOL(__SetPageMovable);
@@ -139,12 +138,10 @@ void __ClearPageMovable(struct page *page)
{
VM_BUG_ON_PAGE(!PageMovable(page), page);
/*
- * Clear registered address_space val with keeping PAGE_MAPPING_MOVABLE
- * flag so that VM can catch up released page by driver after isolation.
- * With it, VM migration doesn't try to put it back.
+ * This page still has the type of a movable page, but it's
+ * actually not movable any more.
*/
- page->mapping = (void *)((unsigned long)page->mapping &
- PAGE_MAPPING_MOVABLE);
+ page->mapping = (void *)PAGE_MAPPING_MOVABLE;
}
EXPORT_SYMBOL(__ClearPageMovable);
@@ -616,6 +613,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
break;
set_page_private(page, order);
+ nr_scanned += isolated - 1;
total_isolated += isolated;
cc->nr_freepages += isolated;
list_add_tail(&page->lru, freelist);
@@ -1034,7 +1032,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
/*
* Only pages without mappings or that have a
- * ->migratepage callback are possible to migrate
+ * ->migrate_folio callback are possible to migrate
* without blocking. However, we can be racing with
* truncation so it's necessary to lock the page
* to stabilise the mapping as truncation holds
@@ -1045,7 +1043,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_fail_put;
mapping = page_mapping(page);
- migrate_dirty = !mapping || mapping->a_ops->migratepage;
+ migrate_dirty = !mapping ||
+ mapping->a_ops->migrate_folio;
unlock_page(page);
if (!migrate_dirty)
goto isolate_fail_put;
@@ -1101,6 +1100,7 @@ isolate_success:
isolate_success_no_list:
cc->nr_migratepages += compound_nr(page);
nr_isolated += compound_nr(page);
+ nr_scanned += compound_nr(page) - 1;
/*
* Avoid isolating too much unless this block is being
@@ -1504,6 +1504,7 @@ fast_isolate_freepages(struct compact_control *cc)
if (__isolate_free_page(page, order)) {
set_page_private(page, order);
nr_isolated = 1 << order;
+ nr_scanned += nr_isolated - 1;
cc->nr_freepages += nr_isolated;
list_add_tail(&page->lru, &cc->freepages);
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -3011,7 +3012,7 @@ void kcompactd_run(int nid)
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
*/
void kcompactd_stop(int nid)
{
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 9b559c76d6dd..66265e3a9c65 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -92,4 +92,12 @@ config DAMON_RECLAIM
reclamation under light memory pressure, while the traditional page
scanning-based reclamation is used for heavy pressure.
+config DAMON_LRU_SORT
+ bool "Build DAMON-based LRU-lists sorting (DAMON_LRU_SORT)"
+ depends on DAMON_PADDR
+ help
+ This builds the DAMON-based LRU-lists sorting subsystem. It tries to
+ protect frequently accessed (hot) pages while rarely accessed (cold)
+ pages reclaimed first under memory pressure.
+
endmenu
diff --git a/mm/damon/Makefile b/mm/damon/Makefile
index dbf7190b4144..3e6b8ad73858 100644
--- a/mm/damon/Makefile
+++ b/mm/damon/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o
obj-$(CONFIG_DAMON_SYSFS) += sysfs.o
obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o
+obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index a0dab8b5e45f..cb8a7e9926a4 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -97,6 +97,31 @@ out:
return ret;
}
+/*
+ * Return corresponding dbgfs' scheme action value (int) for the given
+ * damos_action if the given damos_action value is valid and supported by
+ * dbgfs, negative error code otherwise.
+ */
+static int damos_action_to_dbgfs_scheme_action(enum damos_action action)
+{
+ switch (action) {
+ case DAMOS_WILLNEED:
+ return 0;
+ case DAMOS_COLD:
+ return 1;
+ case DAMOS_PAGEOUT:
+ return 2;
+ case DAMOS_HUGEPAGE:
+ return 3;
+ case DAMOS_NOHUGEPAGE:
+ return 4;
+ case DAMOS_STAT:
+ return 5;
+ default:
+ return -EINVAL;
+ }
+}
+
static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
{
struct damos *s;
@@ -109,7 +134,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
s->min_sz_region, s->max_sz_region,
s->min_nr_accesses, s->max_nr_accesses,
s->min_age_region, s->max_age_region,
- s->action,
+ damos_action_to_dbgfs_scheme_action(s->action),
s->quota.ms, s->quota.sz,
s->quota.reset_interval,
s->quota.weight_sz,
@@ -160,18 +185,27 @@ static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
kfree(schemes);
}
-static bool damos_action_valid(int action)
+/*
+ * Return corresponding damos_action for the given dbgfs input for a scheme
+ * action if the input is valid, negative error code otherwise.
+ */
+static enum damos_action dbgfs_scheme_action_to_damos_action(int dbgfs_action)
{
- switch (action) {
- case DAMOS_WILLNEED:
- case DAMOS_COLD:
- case DAMOS_PAGEOUT:
- case DAMOS_HUGEPAGE:
- case DAMOS_NOHUGEPAGE:
- case DAMOS_STAT:
- return true;
+ switch (dbgfs_action) {
+ case 0:
+ return DAMOS_WILLNEED;
+ case 1:
+ return DAMOS_COLD;
+ case 2:
+ return DAMOS_PAGEOUT;
+ case 3:
+ return DAMOS_HUGEPAGE;
+ case 4:
+ return DAMOS_NOHUGEPAGE;
+ case 5:
+ return DAMOS_STAT;
default:
- return false;
+ return -EINVAL;
}
}
@@ -189,7 +223,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
int pos = 0, parsed, ret;
unsigned long min_sz, max_sz;
unsigned int min_nr_a, max_nr_a, min_age, max_age;
- unsigned int action;
+ unsigned int action_input;
+ enum damos_action action;
schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
GFP_KERNEL);
@@ -204,7 +239,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
ret = sscanf(&str[pos],
"%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
&min_sz, &max_sz, &min_nr_a, &max_nr_a,
- &min_age, &max_age, &action, &quota.ms,
+ &min_age, &max_age, &action_input, &quota.ms,
&quota.sz, &quota.reset_interval,
&quota.weight_sz, &quota.weight_nr_accesses,
&quota.weight_age, &wmarks.metric,
@@ -212,7 +247,8 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
&wmarks.low, &parsed);
if (ret != 18)
break;
- if (!damos_action_valid(action))
+ action = dbgfs_scheme_action_to_damos_action(action_input);
+ if ((int)action < 0)
goto fail;
if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age)
@@ -275,11 +311,6 @@ out:
return ret;
}
-static inline bool target_has_pid(const struct damon_ctx *ctx)
-{
- return ctx->ops.id == DAMON_OPS_VADDR;
-}
-
static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
{
struct damon_target *t;
@@ -288,7 +319,7 @@ static ssize_t sprint_target_ids(struct damon_ctx *ctx, char *buf, ssize_t len)
int rc;
damon_for_each_target(t, ctx) {
- if (target_has_pid(ctx))
+ if (damon_target_has_pid(ctx))
/* Show pid numbers to debugfs users */
id = pid_vnr(t->pid);
else
@@ -415,7 +446,7 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
struct damon_target *t, *next;
damon_for_each_target_safe(t, next, ctx) {
- if (target_has_pid(ctx))
+ if (damon_target_has_pid(ctx))
put_pid(t->pid);
damon_destroy_target(t);
}
@@ -425,11 +456,11 @@ static int dbgfs_set_targets(struct damon_ctx *ctx, ssize_t nr_targets,
if (!t) {
damon_for_each_target_safe(t, next, ctx)
damon_destroy_target(t);
- if (target_has_pid(ctx))
+ if (damon_target_has_pid(ctx))
dbgfs_put_pids(pids, nr_targets);
return -ENOMEM;
}
- if (target_has_pid(ctx))
+ if (damon_target_has_pid(ctx))
t->pid = pids[i];
damon_add_target(ctx, t);
}
@@ -722,7 +753,7 @@ static void dbgfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
- if (!target_has_pid(ctx))
+ if (!damon_target_has_pid(ctx))
return;
mutex_lock(&ctx->kdamond_lock);
diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c
new file mode 100644
index 000000000000..9de6f00a71c5
--- /dev/null
+++ b/mm/damon/lru_sort.c
@@ -0,0 +1,548 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * DAMON-based LRU-lists Sorting
+ *
+ * Author: SeongJae Park <sj@kernel.org>
+ */
+
+#define pr_fmt(fmt) "damon-lru-sort: " fmt
+
+#include <linux/damon.h>
+#include <linux/ioport.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "damon_lru_sort."
+
+/*
+ * Enable or disable DAMON_LRU_SORT.
+ *
+ * You can enable DAMON_LRU_SORT by setting the value of this parameter as
+ * ``Y``. Setting it as ``N`` disables DAMON_LRU_SORT. Note that
+ * DAMON_LRU_SORT could do no real monitoring and LRU-lists sorting due to the
+ * watermarks-based activation condition. Refer to below descriptions for the
+ * watermarks parameter for this.
+ */
+static bool enabled __read_mostly;
+
+/*
+ * Make DAMON_LRU_SORT reads the input parameters again, except ``enabled``.
+ *
+ * Input parameters that updated while DAMON_LRU_SORT is running are not
+ * applied by default. Once this parameter is set as ``Y``, DAMON_LRU_SORT
+ * reads values of parametrs except ``enabled`` again. Once the re-reading is
+ * done, this parameter is set as ``N``. If invalid parameters are found while
+ * the re-reading, DAMON_LRU_SORT will be disabled.
+ */
+static bool commit_inputs __read_mostly;
+module_param(commit_inputs, bool, 0600);
+
+/*
+ * Access frequency threshold for hot memory regions identification in permil.
+ *
+ * If a memory region is accessed in frequency of this or higher,
+ * DAMON_LRU_SORT identifies the region as hot, and mark it as accessed on the
+ * LRU list, so that it could not be reclaimed under memory pressure. 50% by
+ * default.
+ */
+static unsigned long hot_thres_access_freq = 500;
+module_param(hot_thres_access_freq, ulong, 0600);
+
+/*
+ * Time threshold for cold memory regions identification in microseconds.
+ *
+ * If a memory region is not accessed for this or longer time, DAMON_LRU_SORT
+ * identifies the region as cold, and mark it as unaccessed on the LRU list, so
+ * that it could be reclaimed first under memory pressure. 120 seconds by
+ * default.
+ */
+static unsigned long cold_min_age __read_mostly = 120000000;
+module_param(cold_min_age, ulong, 0600);
+
+/*
+ * Limit of time for trying the LRU lists sorting in milliseconds.
+ *
+ * DAMON_LRU_SORT tries to use only up to this time within a time window
+ * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used
+ * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the
+ * limit is disabled.
+ *
+ * 10 ms by default.
+ */
+static unsigned long quota_ms __read_mostly = 10;
+module_param(quota_ms, ulong, 0600);
+
+/*
+ * The time quota charge reset interval in milliseconds.
+ *
+ * The charge reset interval for the quota of time (quota_ms). That is,
+ * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms
+ * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds.
+ *
+ * 1 second by default.
+ */
+static unsigned long quota_reset_interval_ms __read_mostly = 1000;
+module_param(quota_reset_interval_ms, ulong, 0600);
+
+/*
+ * The watermarks check time interval in microseconds.
+ *
+ * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is
+ * enabled but inactive due to its watermarks rule. 5 seconds by default.
+ */
+static unsigned long wmarks_interval __read_mostly = 5000000;
+module_param(wmarks_interval, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the high watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is higher than
+ * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically
+ * checks the watermarks. 200 (20%) by default.
+ */
+static unsigned long wmarks_high __read_mostly = 200;
+module_param(wmarks_high, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the middle watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is between this and
+ * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring
+ * and the LRU-lists sorting. 150 (15%) by default.
+ */
+static unsigned long wmarks_mid __read_mostly = 150;
+module_param(wmarks_mid, ulong, 0600);
+
+/*
+ * Free memory rate (per thousand) for the low watermark.
+ *
+ * If free memory of the system in bytes per thousand bytes is lower than this,
+ * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks
+ * the watermarks. 50 (5%) by default.
+ */
+static unsigned long wmarks_low __read_mostly = 50;
+module_param(wmarks_low, ulong, 0600);
+
+/*
+ * Sampling interval for the monitoring in microseconds.
+ *
+ * The sampling interval of DAMON for the hot/cold memory monitoring. Please
+ * refer to the DAMON documentation for more detail. 5 ms by default.
+ */
+static unsigned long sample_interval __read_mostly = 5000;
+module_param(sample_interval, ulong, 0600);
+
+/*
+ * Aggregation interval for the monitoring in microseconds.
+ *
+ * The aggregation interval of DAMON for the hot/cold memory monitoring.
+ * Please refer to the DAMON documentation for more detail. 100 ms by default.
+ */
+static unsigned long aggr_interval __read_mostly = 100000;
+module_param(aggr_interval, ulong, 0600);
+
+/*
+ * Minimum number of monitoring regions.
+ *
+ * The minimal number of monitoring regions of DAMON for the hot/cold memory
+ * monitoring. This can be used to set lower-bound of the monitoring quality.
+ * But, setting this too high could result in increased monitoring overhead.
+ * Please refer to the DAMON documentation for more detail. 10 by default.
+ */
+static unsigned long min_nr_regions __read_mostly = 10;
+module_param(min_nr_regions, ulong, 0600);
+
+/*
+ * Maximum number of monitoring regions.
+ *
+ * The maximum number of monitoring regions of DAMON for the hot/cold memory
+ * monitoring. This can be used to set upper-bound of the monitoring overhead.
+ * However, setting this too low could result in bad monitoring quality.
+ * Please refer to the DAMON documentation for more detail. 1000 by default.
+ */
+static unsigned long max_nr_regions __read_mostly = 1000;
+module_param(max_nr_regions, ulong, 0600);
+
+/*
+ * Start of the target memory region in physical address.
+ *
+ * The start physical address of memory region that DAMON_LRU_SORT will do work
+ * against. By default, biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_start __read_mostly;
+module_param(monitor_region_start, ulong, 0600);
+
+/*
+ * End of the target memory region in physical address.
+ *
+ * The end physical address of memory region that DAMON_LRU_SORT will do work
+ * against. By default, biggest System RAM is used as the region.
+ */
+static unsigned long monitor_region_end __read_mostly;
+module_param(monitor_region_end, ulong, 0600);
+
+/*
+ * PID of the DAMON thread
+ *
+ * If DAMON_LRU_SORT is enabled, this becomes the PID of the worker thread.
+ * Else, -1.
+ */
+static int kdamond_pid __read_mostly = -1;
+module_param(kdamond_pid, int, 0400);
+
+/*
+ * Number of hot memory regions that tried to be LRU-sorted.
+ */
+static unsigned long nr_lru_sort_tried_hot_regions __read_mostly;
+module_param(nr_lru_sort_tried_hot_regions, ulong, 0400);
+
+/*
+ * Total bytes of hot memory regions that tried to be LRU-sorted.
+ */
+static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly;
+module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400);
+
+/*
+ * Number of hot memory regions that successfully be LRU-sorted.
+ */
+static unsigned long nr_lru_sorted_hot_regions __read_mostly;
+module_param(nr_lru_sorted_hot_regions, ulong, 0400);
+
+/*
+ * Total bytes of hot memory regions that successfully be LRU-sorted.
+ */
+static unsigned long bytes_lru_sorted_hot_regions __read_mostly;
+module_param(bytes_lru_sorted_hot_regions, ulong, 0400);
+
+/*
+ * Number of times that the time quota limit for hot regions have exceeded
+ */
+static unsigned long nr_hot_quota_exceeds __read_mostly;
+module_param(nr_hot_quota_exceeds, ulong, 0400);
+
+/*
+ * Number of cold memory regions that tried to be LRU-sorted.
+ */
+static unsigned long nr_lru_sort_tried_cold_regions __read_mostly;
+module_param(nr_lru_sort_tried_cold_regions, ulong, 0400);
+
+/*
+ * Total bytes of cold memory regions that tried to be LRU-sorted.
+ */
+static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly;
+module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400);
+
+/*
+ * Number of cold memory regions that successfully be LRU-sorted.
+ */
+static unsigned long nr_lru_sorted_cold_regions __read_mostly;
+module_param(nr_lru_sorted_cold_regions, ulong, 0400);
+
+/*
+ * Total bytes of cold memory regions that successfully be LRU-sorted.
+ */
+static unsigned long bytes_lru_sorted_cold_regions __read_mostly;
+module_param(bytes_lru_sorted_cold_regions, ulong, 0400);
+
+/*
+ * Number of times that the time quota limit for cold regions have exceeded
+ */
+static unsigned long nr_cold_quota_exceeds __read_mostly;
+module_param(nr_cold_quota_exceeds, ulong, 0400);
+
+static struct damon_ctx *ctx;
+static struct damon_target *target;
+
+struct damon_lru_sort_ram_walk_arg {
+ unsigned long start;
+ unsigned long end;
+};
+
+static int walk_system_ram(struct resource *res, void *arg)
+{
+ struct damon_lru_sort_ram_walk_arg *a = arg;
+
+ if (a->end - a->start < resource_size(res)) {
+ a->start = res->start;
+ a->end = res->end;
+ }
+ return 0;
+}
+
+/*
+ * Find biggest 'System RAM' resource and store its start and end address in
+ * @start and @end, respectively. If no System RAM is found, returns false.
+ */
+static bool get_monitoring_region(unsigned long *start, unsigned long *end)
+{
+ struct damon_lru_sort_ram_walk_arg arg = {};
+
+ walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram);
+ if (arg.end <= arg.start)
+ return false;
+
+ *start = arg.start;
+ *end = arg.end;
+ return true;
+}
+
+/* Create a DAMON-based operation scheme for hot memory regions */
+static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres)
+{
+ struct damos_watermarks wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = wmarks_interval,
+ .high = wmarks_high,
+ .mid = wmarks_mid,
+ .low = wmarks_low,
+ };
+ struct damos_quota quota = {
+ /*
+ * Do not try LRU-lists sorting of hot pages for more than half
+ * of quota_ms milliseconds within quota_reset_interval_ms.
+ */
+ .ms = quota_ms / 2,
+ .sz = 0,
+ .reset_interval = quota_reset_interval_ms,
+ /* Within the quota, mark hotter regions accessed first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 1,
+ .weight_age = 0,
+ };
+ struct damos *scheme = damon_new_scheme(
+ /* Find regions having PAGE_SIZE or larger size */
+ PAGE_SIZE, ULONG_MAX,
+ /* and accessed for more than the threshold */
+ hot_thres, UINT_MAX,
+ /* no matter its age */
+ 0, UINT_MAX,
+ /* prioritize those on LRU lists, as soon as found */
+ DAMOS_LRU_PRIO,
+ /* under the quota. */
+ &quota,
+ /* (De)activate this according to the watermarks. */
+ &wmarks);
+
+ return scheme;
+}
+
+/* Create a DAMON-based operation scheme for cold memory regions */
+static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres)
+{
+ struct damos_watermarks wmarks = {
+ .metric = DAMOS_WMARK_FREE_MEM_RATE,
+ .interval = wmarks_interval,
+ .high = wmarks_high,
+ .mid = wmarks_mid,
+ .low = wmarks_low,
+ };
+ struct damos_quota quota = {
+ /*
+ * Do not try LRU-lists sorting of cold pages for more than
+ * half of quota_ms milliseconds within
+ * quota_reset_interval_ms.
+ */
+ .ms = quota_ms / 2,
+ .sz = 0,
+ .reset_interval = quota_reset_interval_ms,
+ /* Within the quota, mark colder regions not accessed first. */
+ .weight_sz = 0,
+ .weight_nr_accesses = 0,
+ .weight_age = 1,
+ };
+ struct damos *scheme = damon_new_scheme(
+ /* Find regions having PAGE_SIZE or larger size */
+ PAGE_SIZE, ULONG_MAX,
+ /* and not accessed at all */
+ 0, 0,
+ /* for cold_thres or more micro-seconds, and */
+ cold_thres, UINT_MAX,
+ /* mark those as not accessed, as soon as found */
+ DAMOS_LRU_DEPRIO,
+ /* under the quota. */
+ &quota,
+ /* (De)activate this according to the watermarks. */
+ &wmarks);
+
+ return scheme;
+}
+
+static int damon_lru_sort_apply_parameters(void)
+{
+ struct damos *scheme, *next_scheme;
+ struct damon_addr_range addr_range;
+ unsigned int hot_thres, cold_thres;
+ int err = 0;
+
+ err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0,
+ min_nr_regions, max_nr_regions);
+ if (err)
+ return err;
+
+ /* free previously set schemes */
+ damon_for_each_scheme_safe(scheme, next_scheme, ctx)
+ damon_destroy_scheme(scheme);
+
+ /* aggr_interval / sample_interval is the maximum nr_accesses */
+ hot_thres = aggr_interval / sample_interval * hot_thres_access_freq /
+ 1000;
+ scheme = damon_lru_sort_new_hot_scheme(hot_thres);
+ if (!scheme)
+ return -ENOMEM;
+ damon_add_scheme(ctx, scheme);
+
+ cold_thres = cold_min_age / aggr_interval;
+ scheme = damon_lru_sort_new_cold_scheme(cold_thres);
+ if (!scheme)
+ return -ENOMEM;
+ damon_add_scheme(ctx, scheme);
+
+ if (monitor_region_start > monitor_region_end)
+ return -EINVAL;
+ if (!monitor_region_start && !monitor_region_end &&
+ !get_monitoring_region(&monitor_region_start,
+ &monitor_region_end))
+ return -EINVAL;
+ addr_range.start = monitor_region_start;
+ addr_range.end = monitor_region_end;
+ return damon_set_regions(target, &addr_range, 1);
+}
+
+static int damon_lru_sort_turn(bool on)
+{
+ int err;
+
+ if (!on) {
+ err = damon_stop(&ctx, 1);
+ if (!err)
+ kdamond_pid = -1;
+ return err;
+ }
+
+ err = damon_lru_sort_apply_parameters();
+ if (err)
+ return err;
+
+ err = damon_start(&ctx, 1, true);
+ if (err)
+ return err;
+ kdamond_pid = ctx->kdamond->pid;
+ return 0;
+}
+
+static struct delayed_work damon_lru_sort_timer;
+static void damon_lru_sort_timer_fn(struct work_struct *work)
+{
+ static bool last_enabled;
+ bool now_enabled;
+
+ now_enabled = enabled;
+ if (last_enabled != now_enabled) {
+ if (!damon_lru_sort_turn(now_enabled))
+ last_enabled = now_enabled;
+ else
+ enabled = last_enabled;
+ }
+}
+static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn);
+
+static bool damon_lru_sort_initialized;
+
+static int damon_lru_sort_enabled_store(const char *val,
+ const struct kernel_param *kp)
+{
+ int rc = param_set_bool(val, kp);
+
+ if (rc < 0)
+ return rc;
+
+ if (!damon_lru_sort_initialized)
+ return rc;
+
+ schedule_delayed_work(&damon_lru_sort_timer, 0);
+
+ return 0;
+}
+
+static const struct kernel_param_ops enabled_param_ops = {
+ .set = damon_lru_sort_enabled_store,
+ .get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
+MODULE_PARM_DESC(enabled,
+ "Enable or disable DAMON_LRU_SORT (default: disabled)");
+
+static int damon_lru_sort_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_lru_sort_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
+static int damon_lru_sort_after_aggregation(struct damon_ctx *c)
+{
+ struct damos *s;
+
+ /* update the stats parameter */
+ damon_for_each_scheme(s, c) {
+ if (s->action == DAMOS_LRU_PRIO) {
+ nr_lru_sort_tried_hot_regions = s->stat.nr_tried;
+ bytes_lru_sort_tried_hot_regions = s->stat.sz_tried;
+ nr_lru_sorted_hot_regions = s->stat.nr_applied;
+ bytes_lru_sorted_hot_regions = s->stat.sz_applied;
+ nr_hot_quota_exceeds = s->stat.qt_exceeds;
+ } else if (s->action == DAMOS_LRU_DEPRIO) {
+ nr_lru_sort_tried_cold_regions = s->stat.nr_tried;
+ bytes_lru_sort_tried_cold_regions = s->stat.sz_tried;
+ nr_lru_sorted_cold_regions = s->stat.nr_applied;
+ bytes_lru_sorted_cold_regions = s->stat.sz_applied;
+ nr_cold_quota_exceeds = s->stat.qt_exceeds;
+ }
+ }
+
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c)
+{
+ return damon_lru_sort_handle_commit_inputs();
+}
+
+static int __init damon_lru_sort_init(void)
+{
+ ctx = damon_new_ctx();
+ if (!ctx)
+ return -ENOMEM;
+
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
+ damon_destroy_ctx(ctx);
+ return -EINVAL;
+ }
+
+ ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check;
+ ctx->callback.after_aggregation = damon_lru_sort_after_aggregation;
+
+ target = damon_new_target();
+ if (!target) {
+ damon_destroy_ctx(ctx);
+ return -ENOMEM;
+ }
+ damon_add_target(ctx, target);
+
+ schedule_delayed_work(&damon_lru_sort_timer, 0);
+
+ damon_lru_sort_initialized = true;
+ return 0;
+}
+
+module_init(damon_lru_sort_init);
diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c
index 10ef20b2003f..b1335de200e7 100644
--- a/mm/damon/ops-common.c
+++ b/mm/damon/ops-common.c
@@ -130,3 +130,45 @@ int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
/* Return coldness of the region */
return DAMOS_MAX_SCORE - hotness;
}
+
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s)
+{
+ unsigned int max_nr_accesses;
+ int freq_subscore;
+ unsigned int age_in_sec;
+ int age_in_log, age_subscore;
+ unsigned int freq_weight = s->quota.weight_nr_accesses;
+ unsigned int age_weight = s->quota.weight_age;
+ int hotness;
+
+ max_nr_accesses = c->aggr_interval / c->sample_interval;
+ freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses;
+
+ age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000;
+ for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec;
+ age_in_log++, age_in_sec >>= 1)
+ ;
+
+ /* If frequency is 0, higher age means it's colder */
+ if (freq_subscore == 0)
+ age_in_log *= -1;
+
+ /*
+ * Now age_in_log is in [-DAMON_MAX_AGE_IN_LOG, DAMON_MAX_AGE_IN_LOG].
+ * Scale it to be in [0, 100] and set it as age subscore.
+ */
+ age_in_log += DAMON_MAX_AGE_IN_LOG;
+ age_subscore = age_in_log * DAMON_MAX_SUBSCORE /
+ DAMON_MAX_AGE_IN_LOG / 2;
+
+ hotness = (freq_weight * freq_subscore + age_weight * age_subscore);
+ if (freq_weight + age_weight)
+ hotness /= freq_weight + age_weight;
+ /*
+ * Transform it to fit in [0, DAMOS_MAX_SCORE]
+ */
+ hotness = hotness * DAMOS_MAX_SCORE / DAMON_MAX_SUBSCORE;
+
+ return hotness;
+}
diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h
index e790cb5f8fe0..52329ff361cd 100644
--- a/mm/damon/ops-common.h
+++ b/mm/damon/ops-common.h
@@ -14,3 +14,5 @@ void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr);
int damon_pageout_score(struct damon_ctx *c, struct damon_region *r,
struct damos *s);
+int damon_hot_score(struct damon_ctx *c, struct damon_region *r,
+ struct damos *s);
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index b40ff5811bb2..dc131c6a5403 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -204,16 +204,11 @@ static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
return max_nr_accesses;
}
-static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
- struct damon_target *t, struct damon_region *r,
- struct damos *scheme)
+static unsigned long damon_pa_pageout(struct damon_region *r)
{
unsigned long addr, applied;
LIST_HEAD(page_list);
- if (scheme->action != DAMOS_PAGEOUT)
- return 0;
-
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
struct page *page = damon_get_page(PHYS_PFN(addr));
@@ -238,6 +233,55 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
return applied * PAGE_SIZE;
}
+static unsigned long damon_pa_mark_accessed(struct damon_region *r)
+{
+ unsigned long addr, applied = 0;
+
+ for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ struct page *page = damon_get_page(PHYS_PFN(addr));
+
+ if (!page)
+ continue;
+ mark_page_accessed(page);
+ put_page(page);
+ applied++;
+ }
+ return applied * PAGE_SIZE;
+}
+
+static unsigned long damon_pa_deactivate_pages(struct damon_region *r)
+{
+ unsigned long addr, applied = 0;
+
+ for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
+ struct page *page = damon_get_page(PHYS_PFN(addr));
+
+ if (!page)
+ continue;
+ deactivate_page(page);
+ put_page(page);
+ applied++;
+ }
+ return applied * PAGE_SIZE;
+}
+
+static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
+ struct damon_target *t, struct damon_region *r,
+ struct damos *scheme)
+{
+ switch (scheme->action) {
+ case DAMOS_PAGEOUT:
+ return damon_pa_pageout(r);
+ case DAMOS_LRU_PRIO:
+ return damon_pa_mark_accessed(r);
+ case DAMOS_LRU_DEPRIO:
+ return damon_pa_deactivate_pages(r);
+ default:
+ break;
+ }
+ return 0;
+}
+
static int damon_pa_scheme_score(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme)
@@ -245,6 +289,10 @@ static int damon_pa_scheme_score(struct damon_ctx *context,
switch (scheme->action) {
case DAMOS_PAGEOUT:
return damon_pageout_score(context, r, scheme);
+ case DAMOS_LRU_PRIO:
+ return damon_hot_score(context, r, scheme);
+ case DAMOS_LRU_DEPRIO:
+ return damon_pageout_score(context, r, scheme);
default:
break;
}
diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c
index 8efbfb24f3a1..a7faf51b4bd4 100644
--- a/mm/damon/reclaim.c
+++ b/mm/damon/reclaim.c
@@ -353,7 +353,6 @@ static int damon_reclaim_turn(bool on)
return 0;
}
-#define ENABLE_CHECK_INTERVAL_MS 1000
static struct delayed_work damon_reclaim_timer;
static void damon_reclaim_timer_fn(struct work_struct *work)
{
@@ -367,14 +366,12 @@ static void damon_reclaim_timer_fn(struct work_struct *work)
else
enabled = last_enabled;
}
-
- if (enabled)
- schedule_delayed_work(&damon_reclaim_timer,
- msecs_to_jiffies(ENABLE_CHECK_INTERVAL_MS));
}
static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
-static int enabled_store(const char *val,
+static bool damon_reclaim_initialized;
+
+static int damon_reclaim_enabled_store(const char *val,
const struct kernel_param *kp)
{
int rc = param_set_bool(val, kp);
@@ -382,14 +379,16 @@ static int enabled_store(const char *val,
if (rc < 0)
return rc;
- if (enabled)
- schedule_delayed_work(&damon_reclaim_timer, 0);
+ /* system_wq might not initialized yet */
+ if (!damon_reclaim_initialized)
+ return rc;
+ schedule_delayed_work(&damon_reclaim_timer, 0);
return 0;
}
static const struct kernel_param_ops enabled_param_ops = {
- .set = enabled_store,
+ .set = damon_reclaim_enabled_store,
.get = param_get_bool,
};
@@ -397,10 +396,21 @@ module_param_cb(enabled, &enabled_param_ops, &enabled, 0600);
MODULE_PARM_DESC(enabled,
"Enable or disable DAMON_RECLAIM (default: disabled)");
+static int damon_reclaim_handle_commit_inputs(void)
+{
+ int err;
+
+ if (!commit_inputs)
+ return 0;
+
+ err = damon_reclaim_apply_parameters();
+ commit_inputs = false;
+ return err;
+}
+
static int damon_reclaim_after_aggregation(struct damon_ctx *c)
{
struct damos *s;
- int err = 0;
/* update the stats parameter */
damon_for_each_scheme(s, c) {
@@ -411,22 +421,12 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c)
nr_quota_exceeds = s->stat.qt_exceeds;
}
- if (commit_inputs) {
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- }
- return err;
+ return damon_reclaim_handle_commit_inputs();
}
static int damon_reclaim_after_wmarks_check(struct damon_ctx *c)
{
- int err = 0;
-
- if (commit_inputs) {
- err = damon_reclaim_apply_parameters();
- commit_inputs = false;
- }
- return err;
+ return damon_reclaim_handle_commit_inputs();
}
static int __init damon_reclaim_init(void)
@@ -435,8 +435,10 @@ static int __init damon_reclaim_init(void)
if (!ctx)
return -ENOMEM;
- if (damon_select_ops(ctx, DAMON_OPS_PADDR))
+ if (damon_select_ops(ctx, DAMON_OPS_PADDR)) {
+ damon_destroy_ctx(ctx);
return -EINVAL;
+ }
ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check;
ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
@@ -449,6 +451,8 @@ static int __init damon_reclaim_init(void)
damon_add_target(ctx, target);
schedule_delayed_work(&damon_reclaim_timer, 0);
+
+ damon_reclaim_initialized = true;
return 0;
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index 09f9e8ca3d1f..7488e27c87c3 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -762,6 +762,8 @@ static const char * const damon_sysfs_damos_action_strs[] = {
"pageout",
"hugepage",
"nohugepage",
+ "lru_prio",
+ "lru_deprio",
"stat",
};
@@ -2136,8 +2138,7 @@ static void damon_sysfs_destroy_targets(struct damon_ctx *ctx)
struct damon_target *t, *next;
damon_for_each_target_safe(t, next, ctx) {
- if (ctx->ops.id == DAMON_OPS_VADDR ||
- ctx->ops.id == DAMON_OPS_FVADDR)
+ if (damon_target_has_pid(ctx))
put_pid(t->pid);
damon_destroy_target(t);
}
@@ -2181,8 +2182,7 @@ static int damon_sysfs_add_target(struct damon_sysfs_target *sys_target,
if (!t)
return -ENOMEM;
- if (ctx->ops.id == DAMON_OPS_VADDR ||
- ctx->ops.id == DAMON_OPS_FVADDR) {
+ if (damon_target_has_pid(ctx)) {
t->pid = find_get_pid(sys_target->pid);
if (!t->pid)
goto destroy_targets_out;
@@ -2210,7 +2210,7 @@ static struct damon_target *damon_sysfs_existing_target(
struct pid *pid;
struct damon_target *t;
- if (ctx->ops.id == DAMON_OPS_PADDR) {
+ if (!damon_target_has_pid(ctx)) {
/* Up to only one target for paddr could exist */
damon_for_each_target(t, ctx)
return t;
@@ -2359,6 +2359,23 @@ static inline bool damon_sysfs_kdamond_running(
damon_sysfs_ctx_running(kdamond->damon_ctx);
}
+static int damon_sysfs_apply_inputs(struct damon_ctx *ctx,
+ struct damon_sysfs_context *sys_ctx)
+{
+ int err;
+
+ err = damon_select_ops(ctx, sys_ctx->ops_id);
+ if (err)
+ return err;
+ err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
+ if (err)
+ return err;
+ err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
+ if (err)
+ return err;
+ return damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
+}
+
/*
* damon_sysfs_commit_input() - Commit user inputs to a running kdamond.
* @kdamond: The kobject wrapper for the associated kdamond.
@@ -2367,31 +2384,14 @@ static inline bool damon_sysfs_kdamond_running(
*/
static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond)
{
- struct damon_ctx *ctx = kdamond->damon_ctx;
- struct damon_sysfs_context *sys_ctx;
- int err = 0;
-
if (!damon_sysfs_kdamond_running(kdamond))
return -EINVAL;
/* TODO: Support multiple contexts per kdamond */
if (kdamond->contexts->nr != 1)
return -EINVAL;
- sys_ctx = kdamond->contexts->contexts_arr[0];
-
- err = damon_select_ops(ctx, sys_ctx->ops_id);
- if (err)
- return err;
- err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
- if (err)
- return err;
- err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
- if (err)
- return err;
- err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
- if (err)
- return err;
- return err;
+ return damon_sysfs_apply_inputs(kdamond->damon_ctx,
+ kdamond->contexts->contexts_arr[0]);
}
/*
@@ -2438,27 +2438,16 @@ static struct damon_ctx *damon_sysfs_build_ctx(
if (!ctx)
return ERR_PTR(-ENOMEM);
- err = damon_select_ops(ctx, sys_ctx->ops_id);
- if (err)
- goto out;
- err = damon_sysfs_set_attrs(ctx, sys_ctx->attrs);
- if (err)
- goto out;
- err = damon_sysfs_set_targets(ctx, sys_ctx->targets);
- if (err)
- goto out;
- err = damon_sysfs_set_schemes(ctx, sys_ctx->schemes);
- if (err)
- goto out;
+ err = damon_sysfs_apply_inputs(ctx, sys_ctx);
+ if (err) {
+ damon_destroy_ctx(ctx);
+ return ERR_PTR(err);
+ }
ctx->callback.after_wmarks_check = damon_sysfs_cmd_request_callback;
ctx->callback.after_aggregation = damon_sysfs_cmd_request_callback;
ctx->callback.before_terminate = damon_sysfs_before_terminate;
return ctx;
-
-out:
- damon_destroy_ctx(ctx);
- return ERR_PTR(err);
}
static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index 59e1653799f8..3c7b9d6dca95 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -336,8 +336,7 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
if (pte_young(entry)) {
referenced = true;
entry = pte_mkold(entry);
- huge_ptep_set_access_flags(vma, addr, pte, entry,
- vma->vm_flags & VM_WRITE);
+ set_huge_pte_at(mm, addr, pte, entry);
}
#ifdef CONFIG_MMU_NOTIFIER
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 1ab091f49fc0..dc7df1254f0a 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -35,7 +35,7 @@
#include <asm/tlbflush.h>
/*
- * Please refer Documentation/vm/arch_pgtable_helpers.rst for the semantics
+ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics
* expectations that are being validated here. All future changes in here
* or the documentation need to be in sync.
*/
diff --git a/mm/filemap.c b/mm/filemap.c
index ac3775c1ce4c..15800334147b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -667,7 +667,7 @@ EXPORT_SYMBOL_GPL(filemap_range_has_writeback);
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
{
- int err = 0;
+ int err = 0, err2;
if (mapping_needs_writeback(mapping)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
@@ -678,18 +678,12 @@ int filemap_write_and_wait_range(struct address_space *mapping,
* But the -EIO is special case, it may indicate the worst
* thing (e.g. bug) happened, so we avoid waiting for it.
*/
- if (err != -EIO) {
- int err2 = filemap_fdatawait_range(mapping,
- lstart, lend);
- if (!err)
- err = err2;
- } else {
- /* Clear any previously stored errors */
- filemap_check_errors(mapping);
- }
- } else {
- err = filemap_check_errors(mapping);
+ if (err != -EIO)
+ __filemap_fdatawait_range(mapping, lstart, lend);
}
+ err2 = filemap_check_errors(mapping);
+ if (!err)
+ err = err2;
return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
@@ -929,26 +923,6 @@ error:
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
-/**
- * add_to_page_cache_locked - add a locked page to the pagecache
- * @page: page to add
- * @mapping: the page's address_space
- * @offset: page index
- * @gfp_mask: page allocation mode
- *
- * This function is used to add a page to the pagecache. It must be locked.
- * This function does not add the page to the LRU. The caller must do that.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask)
-{
- return __filemap_add_folio(mapping, page_folio(page), offset,
- gfp_mask, NULL);
-}
-EXPORT_SYMBOL(add_to_page_cache_locked);
-
int filemap_add_folio(struct address_space *mapping, struct folio *folio,
pgoff_t index, gfp_t gfp)
{
@@ -1988,6 +1962,10 @@ no_page:
gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
gfp &= ~__GFP_FS;
+ if (fgp_flags & FGP_NOWAIT) {
+ gfp &= ~GFP_KERNEL;
+ gfp |= GFP_NOWAIT | __GFP_NOWARN;
+ }
folio = filemap_alloc_folio(gfp, 0);
if (!folio)
@@ -2147,65 +2125,46 @@ put:
return folio_batch_count(fbatch);
}
-static inline
-bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
-{
- if (!folio_test_large(folio) || folio_test_hugetlb(folio))
- return false;
- if (index >= max)
- return false;
- return index < folio->index + folio_nr_pages(folio) - 1;
-}
-
/**
- * find_get_pages_range - gang pagecache lookup
+ * filemap_get_folios - Get a batch of folios
* @mapping: The address_space to search
* @start: The starting page index
* @end: The final page index (inclusive)
- * @nr_pages: The maximum number of pages
- * @pages: Where the resulting pages are placed
+ * @fbatch: The batch to fill.
*
- * find_get_pages_range() will search for and return a group of up to @nr_pages
- * pages in the mapping starting at index @start and up to index @end
- * (inclusive). The pages are placed at @pages. find_get_pages_range() takes
- * a reference against the returned pages.
+ * Search for and return a batch of folios in the mapping starting at
+ * index @start and up to index @end (inclusive). The folios are returned
+ * in @fbatch with an elevated reference count.
*
- * The search returns a group of mapping-contiguous pages with ascending
- * indexes. There may be holes in the indices due to not-present pages.
- * We also update @start to index the next page for the traversal.
+ * The first folio may start before @start; if it does, it will contain
+ * @start. The final folio may extend beyond @end; if it does, it will
+ * contain @end. The folios have ascending indices. There may be gaps
+ * between the folios if there are indices which have no folio in the
+ * page cache. If folios are added to or removed from the page cache
+ * while this is running, they may or may not be found by this call.
*
- * Return: the number of pages which were found. If this number is
- * smaller than @nr_pages, the end of specified range has been
- * reached.
+ * Return: The number of folios which were found.
+ * We also update @start to index the next folio for the traversal.
*/
-unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
- pgoff_t end, unsigned int nr_pages,
- struct page **pages)
+unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch)
{
XA_STATE(xas, &mapping->i_pages, *start);
struct folio *folio;
- unsigned ret = 0;
-
- if (unlikely(!nr_pages))
- return 0;
rcu_read_lock();
- while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
/* Skip over shadow, swap and DAX entries */
if (xa_is_value(folio))
continue;
+ if (!folio_batch_add(fbatch, folio)) {
+ unsigned long nr = folio_nr_pages(folio);
-again:
- pages[ret] = folio_file_page(folio, xas.xa_index);
- if (++ret == nr_pages) {
- *start = xas.xa_index + 1;
+ if (folio_test_hugetlb(folio))
+ nr = 1;
+ *start = folio->index + nr;
goto out;
}
- if (folio_more_pages(folio, xas.xa_index, end)) {
- xas.xa_index++;
- folio_ref_inc(folio);
- goto again;
- }
}
/*
@@ -2221,7 +2180,18 @@ again:
out:
rcu_read_unlock();
- return ret;
+ return folio_batch_count(fbatch);
+}
+EXPORT_SYMBOL(filemap_get_folios);
+
+static inline
+bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
+{
+ if (!folio_test_large(folio) || folio_test_hugetlb(folio))
+ return false;
+ if (index >= max)
+ return false;
+ return index < folio->index + folio_nr_pages(folio) - 1;
}
/**
@@ -2385,6 +2355,8 @@ static void filemap_get_read_batch(struct address_space *mapping,
continue;
if (xas.xa_index > max || xa_is_value(folio))
break;
+ if (xa_is_sibling(folio))
+ break;
if (!folio_try_get_rcu(folio))
goto retry;
@@ -2407,7 +2379,7 @@ retry:
rcu_read_unlock();
}
-static int filemap_read_folio(struct file *file, struct address_space *mapping,
+static int filemap_read_folio(struct file *file, filler_t filler,
struct folio *folio)
{
int error;
@@ -2419,7 +2391,7 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping,
*/
folio_clear_error(folio);
/* Start the actual read. The read will unlock the page. */
- error = mapping->a_ops->read_folio(file, folio);
+ error = filler(file, folio);
if (error)
return error;
@@ -2428,7 +2400,8 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping,
return error;
if (folio_test_uptodate(folio))
return 0;
- shrink_readahead_size_eio(&file->f_ra);
+ if (file)
+ shrink_readahead_size_eio(&file->f_ra);
return -EIO;
}
@@ -2501,7 +2474,8 @@ static int filemap_update_page(struct kiocb *iocb,
if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
goto unlock;
- error = filemap_read_folio(iocb->ki_filp, mapping, folio);
+ error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
+ folio);
goto unlock_mapping;
unlock:
folio_unlock(folio);
@@ -2544,7 +2518,7 @@ static int filemap_create_folio(struct file *file,
if (error)
goto error;
- error = filemap_read_folio(file, mapping, folio);
+ error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
if (error)
goto error;
@@ -2629,6 +2603,13 @@ err:
return err;
}
+static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
+{
+ unsigned int shift = folio_shift(folio);
+
+ return (pos1 >> shift == pos2 >> shift);
+}
+
/**
* filemap_read - Read data from the page cache.
* @iocb: The iocb to read.
@@ -2700,11 +2681,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
writably_mapped = mapping_writably_mapped(mapping);
/*
- * When a sequential read accesses a page several times, only
+ * When a read accesses the same folio several times, only
* mark it as accessed the first time.
*/
- if (iocb->ki_pos >> PAGE_SHIFT !=
- ra->prev_pos >> PAGE_SHIFT)
+ if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1,
+ fbatch.folios[0]))
folio_mark_accessed(fbatch.folios[0]);
for (i = 0; i < folio_batch_count(&fbatch); i++) {
@@ -3221,7 +3202,7 @@ page_not_uptodate:
* and we need to check for errors.
*/
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
- error = filemap_read_folio(file, mapping, folio);
+ error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
if (fpin)
goto out_retry;
folio_put(folio);
@@ -3511,20 +3492,7 @@ repeat:
return ERR_PTR(err);
}
-filler:
- err = filler(file, folio);
- if (err < 0) {
- folio_put(folio);
- return ERR_PTR(err);
- }
-
- folio_wait_locked(folio);
- if (!folio_test_uptodate(folio)) {
- folio_put(folio);
- return ERR_PTR(-EIO);
- }
-
- goto out;
+ goto filler;
}
if (folio_test_uptodate(folio))
goto out;
@@ -3547,14 +3515,14 @@ filler:
goto out;
}
- /*
- * A previous I/O error may have been due to temporary
- * failures.
- * Clear page error before actual read, PG_error will be
- * set again if read page fails.
- */
- folio_clear_error(folio);
- goto filler;
+filler:
+ err = filemap_read_folio(file, filler, folio);
+ if (err) {
+ folio_put(folio);
+ if (err == AOP_TRUNCATED_PAGE)
+ goto repeat;
+ return ERR_PTR(err);
+ }
out:
folio_mark_accessed(folio);
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 20bc15b57d93..458618c7302c 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -51,28 +51,6 @@ void mark_page_accessed(struct page *page)
}
EXPORT_SYMBOL(mark_page_accessed);
-#ifdef CONFIG_MIGRATION
-int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page, int extra_count)
-{
- return folio_migrate_mapping(mapping, page_folio(newpage),
- page_folio(page), extra_count);
-}
-EXPORT_SYMBOL(migrate_page_move_mapping);
-
-void migrate_page_states(struct page *newpage, struct page *page)
-{
- folio_migrate_flags(page_folio(newpage), page_folio(page));
-}
-EXPORT_SYMBOL(migrate_page_states);
-
-void migrate_page_copy(struct page *newpage, struct page *page)
-{
- folio_migrate_copy(page_folio(newpage), page_folio(page));
-}
-EXPORT_SYMBOL(migrate_page_copy);
-#endif
-
bool set_page_writeback(struct page *page)
{
return folio_start_writeback(page_folio(page));
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 6f69b044a8cc..1a97610308cb 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -4,7 +4,7 @@
*
* This code provides the generic "frontend" layer to call a matching
* "backend" driver implementation of frontswap. See
- * Documentation/vm/frontswap.rst for more information.
+ * Documentation/mm/frontswap.rst for more information.
*
* Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
* Author: Dan Magenheimer
diff --git a/mm/gup.c b/mm/gup.c
index 551264407624..732825157430 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -87,7 +87,8 @@ retry:
* belongs to this folio.
*/
if (unlikely(page_folio(page) != folio)) {
- folio_put_refs(folio, refs);
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
goto retry;
}
@@ -133,7 +134,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
* path.
*/
if (unlikely((flags & FOLL_LONGTERM) &&
- !is_pinnable_page(page)))
+ !is_longterm_pinnable_page(page)))
return NULL;
/*
@@ -176,7 +177,8 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
refs *= GUP_PIN_COUNTING_BIAS;
}
- folio_put_refs(folio, refs);
+ if (!put_devmap_managed_page_refs(&folio->page, refs))
+ folio_put_refs(folio, refs);
}
/**
@@ -951,6 +953,25 @@ static int faultin_page(struct vm_area_struct *vma,
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
+ * mmap lock in the page fault handler. Sanity check this.
+ */
+ WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
+ if (locked)
+ *locked = 0;
+ /*
+ * We should do the same as VM_FAULT_RETRY, but let's not
+ * return -EBUSY since that's not reflecting the reality of
+ * what has happened - we've just fully completed a page
+ * fault, with the mmap lock released. Use -EAGAIN to show
+ * that we want to take the mmap lock _again_.
+ */
+ return -EAGAIN;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -1177,6 +1198,7 @@ retry:
case 0:
goto retry;
case -EBUSY:
+ case -EAGAIN:
ret = 0;
fallthrough;
case -EFAULT:
@@ -1303,6 +1325,18 @@ retry:
return -EINTR;
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+
+ if (ret & VM_FAULT_COMPLETED) {
+ /*
+ * NOTE: it's a pity that we need to retake the lock here
+ * to pair with the unlock() in the callers. Ideally we
+ * could tell the callers so they do not need to unlock.
+ */
+ mmap_read_lock(mm);
+ *unlocked = true;
+ return 0;
+ }
+
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1368,7 +1402,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
/* VM_FAULT_RETRY couldn't trigger, bypass */
return ret;
- /* VM_FAULT_RETRY cannot return errors */
+ /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
if (!*locked) {
BUG_ON(ret < 0);
BUG_ON(ret >= nr_pages);
@@ -1672,7 +1706,7 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
goto finish_or_fault;
if (pages) {
- pages[i] = virt_to_page(start);
+ pages[i] = virt_to_page((void *)start);
if (pages[i])
get_page(pages[i]);
}
@@ -1881,7 +1915,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
unsigned long isolation_error_count = 0, i;
struct folio *prev_folio = NULL;
LIST_HEAD(movable_page_list);
- bool drain_allow = true;
+ bool drain_allow = true, coherent_pages = false;
int ret = 0;
for (i = 0; i < nr_pages; i++) {
@@ -1891,14 +1925,43 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
continue;
prev_folio = folio;
- if (folio_is_pinnable(folio))
+ /*
+ * Device coherent pages are managed by a driver and should not
+ * be pinned indefinitely as it prevents the driver moving the
+ * page. So when trying to pin with FOLL_LONGTERM instead try
+ * to migrate the page out of device memory.
+ */
+ if (folio_is_device_coherent(folio)) {
+ /*
+ * We always want a new GUP lookup with device coherent
+ * pages.
+ */
+ pages[i] = 0;
+ coherent_pages = true;
+
+ /*
+ * Migration will fail if the page is pinned, so convert
+ * the pin on the source page to a normal reference.
+ */
+ if (gup_flags & FOLL_PIN) {
+ get_page(&folio->page);
+ unpin_user_page(&folio->page);
+ }
+
+ ret = migrate_device_coherent_page(&folio->page);
+ if (ret)
+ goto unpin_pages;
+
continue;
+ }
+ if (folio_is_longterm_pinnable(folio))
+ continue;
/*
* Try to move out any movable page before pinning the range.
*/
if (folio_test_hugetlb(folio)) {
- if (!isolate_huge_page(&folio->page,
+ if (isolate_hugetlb(&folio->page,
&movable_page_list))
isolation_error_count++;
continue;
@@ -1919,7 +1982,8 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
folio_nr_pages(folio));
}
- if (!list_empty(&movable_page_list) || isolation_error_count)
+ if (!list_empty(&movable_page_list) || isolation_error_count ||
+ coherent_pages)
goto unpin_pages;
/*
@@ -1929,10 +1993,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages,
return nr_pages;
unpin_pages:
- if (gup_flags & FOLL_PIN) {
- unpin_user_pages(pages, nr_pages);
- } else {
- for (i = 0; i < nr_pages; i++)
+ /*
+ * pages[i] might be NULL if any device coherent pages were found.
+ */
+ for (i = 0; i < nr_pages; i++) {
+ if (!pages[i])
+ continue;
+
+ if (gup_flags & FOLL_PIN)
+ unpin_user_page(pages[i]);
+ else
put_page(pages[i]);
}
diff --git a/mm/gup_test.c b/mm/gup_test.c
index d974dec19e1c..12b0a91767d3 100644
--- a/mm/gup_test.c
+++ b/mm/gup_test.c
@@ -53,7 +53,7 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
dump_page(page, "gup_test failure");
break;
} else if (cmd == PIN_LONGTERM_BENCHMARK &&
- WARN(!is_pinnable_page(page),
+ WARN(!is_longterm_pinnable_page(page),
"pages[%lu] is NOT pinnable but pinned\n",
i)) {
dump_page(page, "gup_test failure");
diff --git a/mm/highmem.c b/mm/highmem.c
index 1a692997fac4..c707d7202d5f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -150,7 +150,7 @@ struct page *__kmap_to_page(void *vaddr)
return pte_page(pkmap_page_table[i]);
}
- return virt_to_page(addr);
+ return virt_to_page(vaddr);
}
EXPORT_SYMBOL(__kmap_to_page);
@@ -561,7 +561,7 @@ void *__kmap_local_page_prot(struct page *page, pgprot_t prot)
}
EXPORT_SYMBOL(__kmap_local_page_prot);
-void kunmap_local_indexed(void *vaddr)
+void kunmap_local_indexed(const void *vaddr)
{
unsigned long addr = (unsigned long) vaddr & PAGE_MASK;
pte_t *kmap_pte;
diff --git a/mm/hmm.c b/mm/hmm.c
index 3fd3242c5e50..f2aa63b94d9b 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -212,14 +212,6 @@ int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-static inline bool hmm_is_device_private_entry(struct hmm_range *range,
- swp_entry_t entry)
-{
- return is_device_private_entry(entry) &&
- pfn_swap_entry_to_page(entry)->pgmap->owner ==
- range->dev_private_owner;
-}
-
static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
pte_t pte)
{
@@ -252,10 +244,12 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
swp_entry_t entry = pte_to_swp_entry(pte);
/*
- * Never fault in device private pages, but just report
- * the PFN even if not present.
+ * Don't fault in device private pages owned by the caller,
+ * just report the PFN.
*/
- if (hmm_is_device_private_entry(range, entry)) {
+ if (is_device_private_entry(entry) &&
+ pfn_swap_entry_to_page(entry)->pgmap->owner ==
+ range->dev_private_owner) {
cpu_flags = HMM_PFN_VALID;
if (is_writable_device_private_entry(entry))
cpu_flags |= HMM_PFN_WRITE;
@@ -273,6 +267,9 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
if (!non_swap_entry(entry))
goto fault;
+ if (is_device_private_entry(entry))
+ goto fault;
+
if (is_device_exclusive_entry(entry))
goto fault;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7248002dad9..8a7c1b344abe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -18,6 +18,7 @@
#include <linux/shrinker.h>
#include <linux/mm_inline.h>
#include <linux/swapops.h>
+#include <linux/backing-dev.h>
#include <linux/dax.h>
#include <linux/khugepaged.h>
#include <linux/freezer.h>
@@ -69,21 +70,85 @@ static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool transparent_hugepage_active(struct vm_area_struct *vma)
+bool hugepage_vma_check(struct vm_area_struct *vma,
+ unsigned long vm_flags,
+ bool smaps, bool in_pf)
{
- /* The addr is used to check if the vma size fits */
- unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+ if (!vma->vm_mm) /* vdso */
+ return false;
+
+ /*
+ * Explicitly disabled through madvise or prctl, or some
+ * architectures may disable THP for some mappings, for
+ * example, s390 kvm.
+ * */
+ if ((vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
+ /*
+ * If the hardware/firmware marked hugepage support disabled.
+ */
+ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX))
+ return false;
- if (!transhuge_vma_suitable(vma, addr))
+ /* khugepaged doesn't collapse DAX vma, but page fault is fine. */
+ if (vma_is_dax(vma))
+ return in_pf;
+
+ /*
+ * Special VMA and hugetlb VMA.
+ * Must be checked after dax since some dax mappings may have
+ * VM_MIXEDMAP set.
+ */
+ if (vm_flags & VM_NO_KHUGEPAGED)
+ return false;
+
+ /*
+ * Check alignment for file vma and size for both file and anon vma.
+ *
+ * Skip the check for page fault. Huge fault does the check in fault
+ * handlers. And this check is not suitable for huge PUD fault.
+ */
+ if (!in_pf &&
+ !transhuge_vma_suitable(vma, (vma->vm_end - HPAGE_PMD_SIZE)))
return false;
- if (vma_is_anonymous(vma))
- return __transparent_hugepage_enabled(vma);
- if (vma_is_shmem(vma))
+
+ /*
+ * Enabled via shmem mount options or sysfs settings.
+ * Must be done before hugepage flags check since shmem has its
+ * own flags.
+ */
+ if (!in_pf && shmem_file(vma->vm_file))
return shmem_huge_enabled(vma);
- if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+
+ if (!hugepage_flags_enabled())
+ return false;
+
+ /* THP settings require madvise. */
+ if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
+ return false;
+
+ /* Only regular file is valid */
+ if (!in_pf && file_thp_enabled(vma))
return true;
- return false;
+ if (!vma_is_anonymous(vma))
+ return false;
+
+ if (vma_is_temporary_stack(vma))
+ return false;
+
+ /*
+ * THPeligible bit of smaps should show 1 for proper VMAs even
+ * though anon_vma is not initialized yet.
+ *
+ * Allow page fault since anon_vma may be not initialized until
+ * the first page fault.
+ */
+ if (!vma->anon_vma)
+ return (smaps || in_pf);
+
+ return true;
}
static bool get_huge_zero_page(void)
@@ -212,8 +277,8 @@ static ssize_t enabled_store(struct kobject *kobj,
}
return ret;
}
-static struct kobj_attribute enabled_attr =
- __ATTR(enabled, 0644, enabled_show, enabled_store);
+
+static struct kobj_attribute enabled_attr = __ATTR_RW(enabled);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf,
@@ -302,8 +367,7 @@ static ssize_t defrag_store(struct kobject *kobj,
return count;
}
-static struct kobj_attribute defrag_attr =
- __ATTR(defrag, 0644, defrag_show, defrag_store);
+static struct kobj_attribute defrag_attr = __ATTR_RW(defrag);
static ssize_t use_zero_page_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -317,8 +381,7 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
-static struct kobj_attribute use_zero_page_attr =
- __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
+static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
@@ -423,10 +486,10 @@ static int __init hugepage_init(void)
if (err)
goto err_slab;
- err = register_shrinker(&huge_zero_page_shrinker);
+ err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
if (err)
goto err_hzp_shrinker;
- err = register_shrinker(&deferred_split_shrinker);
+ err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
if (err)
goto err_split_shrinker;
@@ -519,7 +582,7 @@ static inline struct deferred_split *get_deferred_split_queue(struct page *page)
void prep_transhuge_page(struct page *page)
{
/*
- * we use page->mapping and page->indexlru in second tail page
+ * we use page->mapping and page->index in second tail page
* as list_head: assuming THP order >= 2
*/
@@ -726,7 +789,7 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
- khugepaged_enter(vma, vma->vm_flags);
+ khugepaged_enter_vma(vma, vma->vm_flags);
if (!(vmf->flags & FAULT_FLAG_WRITE) &&
!mm_forbids_zeropage(vma->vm_mm) &&
@@ -956,15 +1019,15 @@ EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags)
+ pmd_t *pmd, bool write)
{
pmd_t _pmd;
_pmd = pmd_mkyoung(*pmd);
- if (flags & FOLL_WRITE)
+ if (write)
_pmd = pmd_mkdirty(_pmd);
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, flags & FOLL_WRITE))
+ pmd, _pmd, write))
update_mmu_cache_pmd(vma, addr, pmd);
}
@@ -997,7 +1060,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags);
+ touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
/*
* device mapped pages can only be returned if the
@@ -1120,15 +1183,15 @@ out:
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static void touch_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags)
+ pud_t *pud, bool write)
{
pud_t _pud;
_pud = pud_mkyoung(*pud);
- if (flags & FOLL_WRITE)
+ if (write)
_pud = pud_mkdirty(_pud);
if (pudp_set_access_flags(vma, addr & HPAGE_PUD_MASK,
- pud, _pud, flags & FOLL_WRITE))
+ pud, _pud, write))
update_mmu_cache_pud(vma, addr, pud);
}
@@ -1155,7 +1218,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
return NULL;
if (flags & FOLL_TOUCH)
- touch_pud(vma, addr, pud, flags);
+ touch_pud(vma, addr, pud, flags & FOLL_WRITE);
/*
* device mapped pages can only be returned if the
@@ -1220,21 +1283,13 @@ out_unlock:
void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud)
{
- pud_t entry;
- unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
vmf->ptl = pud_lock(vmf->vma->vm_mm, vmf->pud);
if (unlikely(!pud_same(*vmf->pud, orig_pud)))
goto unlock;
- entry = pud_mkyoung(orig_pud);
- if (write)
- entry = pud_mkdirty(entry);
- haddr = vmf->address & HPAGE_PUD_MASK;
- if (pudp_set_access_flags(vmf->vma, haddr, vmf->pud, entry, write))
- update_mmu_cache_pud(vmf->vma, vmf->address, vmf->pud);
-
+ touch_pud(vmf->vma, vmf->address, vmf->pud, write);
unlock:
spin_unlock(vmf->ptl);
}
@@ -1242,21 +1297,13 @@ unlock:
void huge_pmd_set_accessed(struct vm_fault *vmf)
{
- pmd_t entry;
- unsigned long haddr;
bool write = vmf->flags & FAULT_FLAG_WRITE;
- pmd_t orig_pmd = vmf->orig_pmd;
vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
+ if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
goto unlock;
- entry = pmd_mkyoung(orig_pmd);
- if (write)
- entry = pmd_mkdirty(entry);
- haddr = vmf->address & HPAGE_PMD_MASK;
- if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write))
- update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd);
+ touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
unlock:
spin_unlock(vmf->ptl);
@@ -1392,7 +1439,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
return ERR_PTR(-ENOMEM);
if (flags & FOLL_TOUCH)
- touch_pmd(vma, addr, pmd, flags);
+ touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
@@ -1685,7 +1732,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
pmd = move_soft_dirty_pmd(pmd);
set_pmd_at(mm, new_addr, new_pmd, pmd);
if (force_flush)
- flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ flush_pmd_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
spin_unlock(old_ptl);
@@ -1842,10 +1889,10 @@ spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
}
/*
- * Returns true if a given pud maps a thp, false otherwise.
+ * Returns page table lock pointer if a given pud maps a thp, NULL otherwise.
*
- * Note that if it returns true, this routine returns without unlocking page
- * table lock. So callers must unlock it.
+ * Note that if it returns page table lock pointer, this routine returns without
+ * unlocking page table lock. So callers must unlock it.
*/
spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma)
{
@@ -1867,12 +1914,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
ptl = __pud_trans_huge_lock(pud, vma);
if (!ptl)
return 0;
- /*
- * For architectures like ppc64 we look at deposited pgtable
- * when calling pudp_huge_get_and_clear. So do the
- * pgtable_trans_huge_withdraw after finishing pudp related
- * operations.
- */
+
pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_special_huge(vma)) {
@@ -1937,7 +1979,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
* replacing a zero pmd write protected page with a zero pte write
* protected page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
pmdp_huge_clear_flush(vma, haddr, pmd);
@@ -2194,6 +2236,10 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd) ||
is_pmd_migration_entry(*pmd)) {
+ /*
+ * It's safe to call pmd_page when folio is set because it's
+ * guaranteed that pmd is present.
+ */
if (folio && folio != page_folio(pmd_page(*pmd)))
goto out;
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
@@ -2377,6 +2423,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
page_tail);
page_tail->mapping = head->mapping;
page_tail->index = head->index + tail;
+ page_tail->private = 0;
/* Page flags must be visible before we make the page non-compound. */
smp_wmb();
@@ -2439,11 +2486,15 @@ static void __split_huge_page(struct page *page, struct list_head *list,
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond EOF: drop them from page cache */
if (head[i].index >= end) {
- ClearPageDirty(head + i);
- __delete_from_page_cache(head + i, NULL);
+ struct folio *tail = page_folio(head + i);
+
if (shmem_mapping(head->mapping))
shmem_uncharge(head->mapping->host, 1);
- put_page(head + i);
+ else if (folio_test_clear_dirty(tail))
+ folio_account_cleaned(tail,
+ inode_to_wb(folio->mapping->host));
+ __filemap_remove_folio(tail, NULL);
+ folio_put(tail);
} else if (!PageAnon(page)) {
__xa_store(&head->mapping->i_pages, head[i].index,
head + i, 0);
@@ -2496,7 +2547,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
* requires taking the lru_lock so we do the put_page
* of the tail pages after the split is complete.
*/
- put_page(subpage);
+ free_page_and_swap_cache(subpage);
}
}
@@ -2815,9 +2866,12 @@ static void split_huge_pages_all(void)
unsigned long total = 0, split = 0;
pr_debug("Split all THPs\n");
- for_each_populated_zone(zone) {
+ for_each_zone(zone) {
+ if (!managed_zone(zone))
+ continue;
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
+ int nr_pages;
if (!pfn_valid(pfn))
continue;
@@ -2833,8 +2887,10 @@ static void split_huge_pages_all(void)
total++;
lock_page(page);
+ nr_pages = thp_nr_pages(page);
if (!split_huge_page(page))
split++;
+ pfn += nr_pages - 1;
unlock_page(page);
next:
put_page(page);
@@ -2892,10 +2948,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
* table filled with PTE-mapped THPs, each of which is distinct.
*/
for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
- struct vm_area_struct *vma = find_vma(mm, addr);
+ struct vm_area_struct *vma = vma_lookup(mm, addr);
struct page *page;
- if (!vma || addr < vma->vm_start)
+ if (!vma)
break;
/* skip special VMA and hugetlb VMA */
@@ -2907,9 +2963,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
/* FOLL_DUMP to ignore special (like zero) pages */
page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
- if (IS_ERR(page))
- continue;
- if (!page)
+ if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
continue;
if (!is_transparent_hugepage(page))
@@ -3131,7 +3185,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
struct vm_area_struct *vma = pvmw->vma;
struct mm_struct *mm = vma->vm_mm;
unsigned long address = pvmw->address;
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
+ unsigned long haddr = address & HPAGE_PMD_MASK;
pmd_t pmde;
swp_entry_t entry;
@@ -3140,7 +3194,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
entry = pmd_to_swp_entry(*pvmw->pmd);
get_page(new);
- pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
+ pmde = pmd_mkold(mk_huge_pmd(new, READ_ONCE(vma->vm_page_prot)));
if (pmd_swp_soft_dirty(*pvmw->pmd))
pmde = pmd_mksoft_dirty(pmde);
if (is_writable_migration_entry(entry))
@@ -3154,12 +3208,12 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
if (!is_readable_migration_entry(entry))
rmap_flags |= RMAP_EXCLUSIVE;
- page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
+ page_add_anon_rmap(new, vma, haddr, rmap_flags);
} else {
page_add_file_rmap(new, vma, true);
}
VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
- set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
+ set_pmd_at(mm, haddr, pvmw->pmd, pmde);
/* No need to invalidate - it was non-present before */
update_mmu_cache_pmd(vma, address, pvmw->pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a57e1be41401..f044962ad9df 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -66,12 +66,6 @@ static bool hugetlb_cma_page(struct page *page, unsigned int order)
#endif
static unsigned long hugetlb_cma_size __initdata;
-/*
- * Minimum page order among possible hugepage sizes, set to a proper value
- * at boot time.
- */
-static unsigned int minimum_order __read_mostly = UINT_MAX;
-
__initdata LIST_HEAD(huge_boot_pages);
/* for command line parsing */
@@ -1135,7 +1129,7 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
lockdep_assert_held(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
- if (pin && !is_pinnable_page(page))
+ if (pin && !is_longterm_pinnable_page(page))
continue;
if (PageHWPoison(page))
@@ -2152,11 +2146,17 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
unsigned long pfn;
struct page *page;
int rc = 0;
+ unsigned int order;
+ struct hstate *h;
if (!hugepages_supported())
return rc;
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+ order = huge_page_order(&default_hstate);
+ for_each_hstate(h)
+ order = min(order, huge_page_order(h));
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) {
page = pfn_to_page(pfn);
rc = dissolve_free_huge_page(page);
if (rc)
@@ -2766,8 +2766,7 @@ retry:
* Fail with -EBUSY if not possible.
*/
spin_unlock_irq(&hugetlb_lock);
- if (!isolate_huge_page(old_page, list))
- ret = -EBUSY;
+ ret = isolate_hugetlb(old_page, list);
spin_lock_irq(&hugetlb_lock);
goto free_new;
} else if (!HPageFreed(old_page)) {
@@ -2843,7 +2842,7 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
if (hstate_is_gigantic(h))
return -ENOMEM;
- if (page_count(head) && isolate_huge_page(head, list))
+ if (page_count(head) && !isolate_hugetlb(head, list))
ret = 0;
else if (!page_count(head))
ret = alloc_and_dissolve_huge_page(h, head, list);
@@ -3149,9 +3148,6 @@ static void __init hugetlb_init_hstates(void)
struct hstate *h, *h2;
for_each_hstate(h) {
- if (minimum_order > huge_page_order(h))
- minimum_order = huge_page_order(h);
-
/* oversize hugepages were init'ed in early boot */
if (!hstate_is_gigantic(h))
hugetlb_hstate_alloc_pages(h);
@@ -3176,7 +3172,6 @@ static void __init hugetlb_init_hstates(void)
h->demote_order = h2->order;
}
}
- VM_BUG_ON(minimum_order == UINT_MAX);
}
static void __init report_hugepages(void)
@@ -4482,22 +4477,20 @@ int hugetlb_report_node_meminfo(char *buf, int len, int nid)
nid, h->surplus_huge_pages_node[nid]);
}
-void hugetlb_show_meminfo(void)
+void hugetlb_show_meminfo_node(int nid)
{
struct hstate *h;
- int nid;
if (!hugepages_supported())
return;
- for_each_node_state(nid, N_MEMORY)
- for_each_hstate(h)
- pr_info("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
- nid,
- h->nr_huge_pages_node[nid],
- h->free_huge_pages_node[nid],
- h->surplus_huge_pages_node[nid],
- huge_page_size(h) / SZ_1K);
+ for_each_hstate(h)
+ printk("Node %d hugepages_total=%u hugepages_free=%u hugepages_surp=%u hugepages_size=%lukB\n",
+ nid,
+ h->nr_huge_pages_node[nid],
+ h->free_huge_pages_node[nid],
+ h->surplus_huge_pages_node[nid],
+ huge_page_size(h) / SZ_1K);
}
void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm)
@@ -4732,6 +4725,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
unsigned long npages = pages_per_huge_page(h);
struct address_space *mapping = src_vma->vm_file->f_mapping;
struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
int ret = 0;
if (cow) {
@@ -4751,11 +4745,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
i_mmap_lock_read(mapping);
}
+ last_addr_mask = hugetlb_mask_last_page(h);
for (addr = src_vma->vm_start; addr < src_vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr, sz);
- if (!src_pte)
+ if (!src_pte) {
+ addr |= last_addr_mask;
continue;
+ }
dst_pte = huge_pte_alloc(dst, dst_vma, addr, sz);
if (!dst_pte) {
ret = -ENOMEM;
@@ -4772,8 +4769,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
* after taking the lock below.
*/
dst_entry = huge_ptep_get(dst_pte);
- if ((dst_pte == src_pte) || !huge_pte_none(dst_entry))
+ if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+ addr |= last_addr_mask;
continue;
+ }
dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -4788,8 +4787,13 @@ again:
* sharing with another vma.
*/
;
- } else if (unlikely(is_hugetlb_entry_migration(entry) ||
- is_hugetlb_entry_hwpoisoned(entry))) {
+ } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
+ bool uffd_wp = huge_pte_uffd_wp(entry);
+
+ if (!userfaultfd_wp(dst_vma) && uffd_wp)
+ entry = huge_pte_clear_uffd_wp(entry);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
+ } else if (unlikely(is_hugetlb_entry_migration(entry))) {
swp_entry_t swp_entry = pte_to_swp_entry(entry);
bool uffd_wp = huge_pte_uffd_wp(entry);
@@ -4803,12 +4807,11 @@ again:
entry = swp_entry_to_pte(swp_entry);
if (userfaultfd_wp(src_vma) && uffd_wp)
entry = huge_pte_mkuffd_wp(entry);
- set_huge_swap_pte_at(src, addr, src_pte,
- entry, sz);
+ set_huge_pte_at(src, addr, src_pte, entry);
}
if (!userfaultfd_wp(dst_vma) && uffd_wp)
entry = huge_pte_clear_uffd_wp(entry);
- set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
+ set_huge_pte_at(dst, addr, dst_pte, entry);
} else if (unlikely(is_pte_marker(entry))) {
/*
* We copy the pte marker only if the dst vma has
@@ -4875,7 +4878,7 @@ again:
* table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
huge_ptep_set_wrprotect(src, addr, src_pte);
entry = huge_pte_wrprotect(entry);
@@ -4934,7 +4937,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
unsigned long old_end = old_addr + len;
- unsigned long old_addr_copy;
+ unsigned long last_addr_mask;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;
bool shared_pmd = false;
@@ -4949,23 +4952,23 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
+ last_addr_mask = hugetlb_mask_last_page(h);
/* Prevent race with file truncation */
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
src_pte = huge_pte_offset(mm, old_addr, sz);
- if (!src_pte)
+ if (!src_pte) {
+ old_addr |= last_addr_mask;
+ new_addr |= last_addr_mask;
continue;
+ }
if (huge_pte_none(huge_ptep_get(src_pte)))
continue;
- /* old_addr arg to huge_pmd_unshare() is a pointer and so the
- * arg may be modified. Pass a copy instead to preserve the
- * value in old_addr.
- */
- old_addr_copy = old_addr;
-
- if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte)) {
+ if (huge_pmd_unshare(mm, vma, old_addr, src_pte)) {
shared_pmd = true;
+ old_addr |= last_addr_mask;
+ new_addr |= last_addr_mask;
continue;
}
@@ -4999,6 +5002,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
@@ -5019,17 +5023,21 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
+ last_addr_mask = hugetlb_mask_last_page(h);
address = start;
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address, sz);
- if (!ptep)
+ if (!ptep) {
+ address |= last_addr_mask;
continue;
+ }
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, address, ptep)) {
spin_unlock(ptl);
tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
force_flush = true;
+ address |= last_addr_mask;
continue;
}
@@ -5414,19 +5422,25 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
pgoff_t idx)
{
+ struct folio *folio = page_folio(page);
struct inode *inode = mapping->host;
struct hstate *h = hstate_inode(inode);
- int err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
+ int err;
- if (err)
+ __folio_set_locked(folio);
+ err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL);
+
+ if (unlikely(err)) {
+ __folio_clear_locked(folio);
return err;
+ }
ClearHPageRestoreReserve(page);
/*
- * set page dirty so that it will not be removed from cache/file
+ * mark folio dirty so that it will not be removed from cache/file
* by non-hugetlbfs specific code paths.
*/
- set_page_dirty(page);
+ folio_mark_dirty(folio);
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
@@ -5703,7 +5717,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, mm, ptep);
+ migration_entry_wait_huge(vma, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
@@ -5947,6 +5961,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
page = alloc_huge_page(dst_vma, dst_addr, 0);
if (IS_ERR(page)) {
+ put_page(*pagep);
ret = -ENOMEM;
*pagep = NULL;
goto out;
@@ -6040,8 +6055,6 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
- (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte,
- dst_vma->vm_flags & VM_WRITE);
hugetlb_count_add(pages_per_huge_page(h), dst_mm);
/* No need to invalidate - it was non-present before */
@@ -6293,6 +6306,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
unsigned long pages = 0, psize = huge_page_size(h);
bool shared_pmd = false;
struct mmu_notifier_range range;
+ unsigned long last_addr_mask;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
@@ -6309,14 +6323,17 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, range.start, range.end);
mmu_notifier_invalidate_range_start(&range);
+ last_addr_mask = hugetlb_mask_last_page(h);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += psize) {
spinlock_t *ptl;
ptep = huge_pte_offset(mm, address, psize);
- if (!ptep)
+ if (!ptep) {
+ address |= last_addr_mask;
continue;
+ }
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, vma, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, address, ptep)) {
/*
* When uffd-wp is enabled on the vma, unshare
* shouldn't happen at all. Warn about it if it
@@ -6326,6 +6343,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
pages++;
spin_unlock(ptl);
shared_pmd = true;
+ address |= last_addr_mask;
continue;
}
pte = huge_ptep_get(ptep);
@@ -6351,8 +6369,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
newpte = pte_swp_mkuffd_wp(newpte);
else if (uffd_wp_resolve)
newpte = pte_swp_clear_uffd_wp(newpte);
- set_huge_swap_pte_at(mm, address, ptep,
- newpte, psize);
+ set_huge_pte_at(mm, address, ptep, newpte);
pages++;
}
spin_unlock(ptl);
@@ -6403,7 +6420,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
* No need to call mmu_notifier_invalidate_range() we are downgrading
* page table protection not changing it to point to a new page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
mmu_notifier_invalidate_range_end(&range);
@@ -6749,11 +6766,11 @@ out:
* 0 the underlying pte page is not shared, or it is the last user
*/
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long *addr, pte_t *ptep)
+ unsigned long addr, pte_t *ptep)
{
- pgd_t *pgd = pgd_offset(mm, *addr);
- p4d_t *p4d = p4d_offset(pgd, *addr);
- pud_t *pud = pud_offset(p4d, *addr);
+ pgd_t *pgd = pgd_offset(mm, addr);
+ p4d_t *p4d = p4d_offset(pgd, addr);
+ pud_t *pud = pud_offset(p4d, addr);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
@@ -6763,14 +6780,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
pud_clear(pud);
put_page(virt_to_page(ptep));
mm_dec_nr_pmds(mm);
- /*
- * This update of passed address optimizes loops sequentially
- * processing addresses in increments of huge page size (PMD_SIZE
- * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
- * Update address to the 'last page' in the cleared area so that
- * calling loop can move to first page past this area.
- */
- *addr |= PUD_SIZE - PMD_SIZE;
return 1;
}
@@ -6782,7 +6791,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
}
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long *addr, pte_t *ptep)
+ unsigned long addr, pte_t *ptep)
{
return 0;
}
@@ -6865,6 +6874,37 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return (pte_t *)pmd;
}
+/*
+ * Return a mask that can be used to update an address to the last huge
+ * page in a page table page mapping size. Used to skip non-present
+ * page table entries when linearly scanning address ranges. Architectures
+ * with unique huge page to page table relationships can define their own
+ * version of this routine.
+ */
+unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+ unsigned long hp_size = huge_page_size(h);
+
+ if (hp_size == PUD_SIZE)
+ return P4D_SIZE - PUD_SIZE;
+ else if (hp_size == PMD_SIZE)
+ return PUD_SIZE - PMD_SIZE;
+ else
+ return 0UL;
+}
+
+#else
+
+/* See description above. Architectures can provide their own version. */
+__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
+{
+#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+ if (huge_page_size(h) == PMD_SIZE)
+ return PUD_SIZE - PMD_SIZE;
+#endif
+ return 0UL;
+}
+
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
/*
@@ -6928,7 +6968,7 @@ retry:
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
- __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+ __migration_entry_wait_huge((pte_t *)pmd, ptl);
goto retry;
}
/*
@@ -6960,15 +7000,15 @@ follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int fla
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
}
-bool isolate_huge_page(struct page *page, struct list_head *list)
+int isolate_hugetlb(struct page *page, struct list_head *list)
{
- bool ret = true;
+ int ret = 0;
spin_lock_irq(&hugetlb_lock);
if (!PageHeadHuge(page) ||
!HPageMigratable(page) ||
!get_page_unless_zero(page)) {
- ret = false;
+ ret = -EBUSY;
goto unlock;
}
ClearHPageMigratable(page);
@@ -7088,21 +7128,18 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (address = start; address < end; address += PUD_SIZE) {
- unsigned long tmp = address;
-
ptep = huge_pte_offset(mm, address, sz);
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- /* We don't want 'address' to be changed */
- huge_pmd_unshare(mm, vma, &tmp, ptep);
+ huge_pmd_unshare(mm, vma, address, ptep);
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
/*
* No need to call mmu_notifier_invalidate_range(), see
- * Documentation/vm/mmu_notifier.rst.
+ * Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
}
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index f9942841df18..c86691c431fd 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -772,6 +772,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
/* Add the numa stat file */
cft = &h->cgroup_files_dfl[6];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
+ cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_cgroup_read_numa_stat;
cft->flags = CFTYPE_NOT_ON_ROOT;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 1089ea8a9c98..1362feb3c6c9 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -6,11 +6,11 @@
*
* Author: Muchun Song <songmuchun@bytedance.com>
*
- * See Documentation/vm/vmemmap_dedup.rst
+ * See Documentation/mm/vmemmap_dedup.rst
*/
#define pr_fmt(fmt) "HugeTLB: " fmt
-#include <linux/memory_hotplug.h>
+#include <linux/memory.h>
#include "hugetlb_vmemmap.h"
/*
@@ -97,18 +97,68 @@ int hugetlb_vmemmap_alloc(struct hstate *h, struct page *head)
return ret;
}
+static unsigned int vmemmap_optimizable_pages(struct hstate *h,
+ struct page *head)
+{
+ if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
+ return 0;
+
+ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG)) {
+ pmd_t *pmdp, pmd;
+ struct page *vmemmap_page;
+ unsigned long vaddr = (unsigned long)head;
+
+ /*
+ * Only the vmemmap page's vmemmap page can be self-hosted.
+ * Walking the page tables to find the backing page of the
+ * vmemmap page.
+ */
+ pmdp = pmd_off_k(vaddr);
+ /*
+ * The READ_ONCE() is used to stabilize *pmdp in a register or
+ * on the stack so that it will stop changing under the code.
+ * The only concurrent operation where it can be changed is
+ * split_vmemmap_huge_pmd() (*pmdp will be stable after this
+ * operation).
+ */
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_leaf(pmd))
+ vmemmap_page = pmd_page(pmd) + pte_index(vaddr);
+ else
+ vmemmap_page = pte_page(*pte_offset_kernel(pmdp, vaddr));
+ /*
+ * Due to HugeTLB alignment requirements and the vmemmap pages
+ * being at the start of the hotplugged memory region in
+ * memory_hotplug.memmap_on_memory case. Checking any vmemmap
+ * page's vmemmap page if it is marked as VmemmapSelfHosted is
+ * sufficient.
+ *
+ * [ hotplugged memory ]
+ * [ section ][...][ section ]
+ * [ vmemmap ][ usable memory ]
+ * ^ | | |
+ * +---+ | |
+ * ^ | |
+ * +-------+ |
+ * ^ |
+ * +-------------------------------------------+
+ */
+ if (PageVmemmapSelfHosted(vmemmap_page))
+ return 0;
+ }
+
+ return hugetlb_optimize_vmemmap_pages(h);
+}
+
void hugetlb_vmemmap_free(struct hstate *h, struct page *head)
{
unsigned long vmemmap_addr = (unsigned long)head;
unsigned long vmemmap_end, vmemmap_reuse, vmemmap_pages;
- vmemmap_pages = hugetlb_optimize_vmemmap_pages(h);
+ vmemmap_pages = vmemmap_optimizable_pages(h, head);
if (!vmemmap_pages)
return;
- if (READ_ONCE(vmemmap_optimize_mode) == VMEMMAP_OPTIMIZE_OFF)
- return;
-
static_branch_inc(&hugetlb_optimize_vmemmap_key);
vmemmap_addr += RESERVE_VMEMMAP_SIZE;
@@ -199,10 +249,10 @@ static struct ctl_table hugetlb_vmemmap_sysctls[] = {
static __init int hugetlb_vmemmap_sysctls_init(void)
{
/*
- * If "memory_hotplug.memmap_on_memory" is enabled or "struct page"
- * crosses page boundaries, the vmemmap pages cannot be optimized.
+ * If "struct page" crosses page boundaries, the vmemmap pages cannot
+ * be optimized.
*/
- if (!mhp_memmap_on_memory() && is_power_of_2(sizeof(struct page)))
+ if (is_power_of_2(sizeof(struct page)))
register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
return 0;
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 5c0cddd81505..65e242b5a432 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -48,7 +48,7 @@ static int hwpoison_inject(void *data, u64 val)
inject:
pr_info("Injecting memory failure at pfn %#lx\n", pfn);
- err = memory_failure(pfn, 0);
+ err = memory_failure(pfn, MF_SW_SIMULATED);
return (err == -EOPNOTSUPP) ? 0 : err;
}
diff --git a/mm/internal.h b/mm/internal.h
index c0f8fbe0445b..785409805ed7 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -853,6 +853,7 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
void free_zone_device_page(struct page *page);
+int migrate_device_coherent_page(struct page *page);
/*
* mm/gup.c
@@ -861,4 +862,24 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
+extern bool mirrored_kernelcore;
+
+static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma)
+{
+ /*
+ * NOTE: we must check this before VM_SOFTDIRTY on soft-dirty
+ * enablements, because when without soft-dirty being compiled in,
+ * VM_SOFTDIRTY is defined as 0x0, then !(vm_flags & VM_SOFTDIRTY)
+ * will be constantly true.
+ */
+ if (!IS_ENABLED(CONFIG_MEM_SOFT_DIRTY))
+ return false;
+
+ /*
+ * Soft-dirty is kind of special: its tracking is enabled when the
+ * vma flags not set.
+ */
+ return !(vma->vm_flags & VM_SOFTDIRTY);
+}
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/ioremap.c b/mm/ioremap.c
index 5fe598ecd9b7..8652426282cc 100644
--- a/mm/ioremap.c
+++ b/mm/ioremap.c
@@ -11,29 +11,35 @@
#include <linux/io.h>
#include <linux/export.h>
-void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
+void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size,
+ unsigned long prot)
{
unsigned long offset, vaddr;
phys_addr_t last_addr;
struct vm_struct *area;
/* Disallow wrap-around or zero size */
- last_addr = addr + size - 1;
- if (!size || last_addr < addr)
+ last_addr = phys_addr + size - 1;
+ if (!size || last_addr < phys_addr)
return NULL;
/* Page-align mappings */
- offset = addr & (~PAGE_MASK);
- addr -= offset;
+ offset = phys_addr & (~PAGE_MASK);
+ phys_addr -= offset;
size = PAGE_ALIGN(size + offset);
+ if (!ioremap_allowed(phys_addr, size, prot))
+ return NULL;
+
area = get_vm_area_caller(size, VM_IOREMAP,
__builtin_return_address(0));
if (!area)
return NULL;
vaddr = (unsigned long)area->addr;
+ area->phys_addr = phys_addr;
- if (ioremap_page_range(vaddr, vaddr + size, addr, __pgprot(prot))) {
+ if (ioremap_page_range(vaddr, vaddr + size, phys_addr,
+ __pgprot(prot))) {
free_vm_area(area);
return NULL;
}
@@ -44,6 +50,12 @@ EXPORT_SYMBOL(ioremap_prot);
void iounmap(volatile void __iomem *addr)
{
- vunmap((void *)((unsigned long)addr & PAGE_MASK));
+ void *vaddr = (void *)((unsigned long)addr & PAGE_MASK);
+
+ if (!iounmap_allowed(vaddr))
+ return;
+
+ if (is_vmalloc_addr(vaddr))
+ vunmap(vaddr);
}
EXPORT_SYMBOL(iounmap);
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index c40c0e7b3b5f..69f583855c8b 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -108,9 +108,10 @@ void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init)
return;
tag = kasan_random_tag();
+ kasan_unpoison(set_tag(page_address(page), tag),
+ PAGE_SIZE << order, init);
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- kasan_unpoison(page_address(page), PAGE_SIZE << order, init);
}
void __kasan_poison_pages(struct page *page, unsigned int order, bool init)
@@ -343,7 +344,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
if (unlikely(nearest_obj(cache, virt_to_slab(object), object) !=
object)) {
- kasan_report_invalid_free(tagged_object, ip);
+ kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_INVALID_FREE);
return true;
}
@@ -352,7 +353,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
return false;
if (!kasan_byte_accessible(tagged_object)) {
- kasan_report_invalid_free(tagged_object, ip);
+ kasan_report_invalid_free(tagged_object, ip, KASAN_REPORT_DOUBLE_FREE);
return true;
}
@@ -377,12 +378,12 @@ bool __kasan_slab_free(struct kmem_cache *cache, void *object,
static inline bool ____kasan_kfree_large(void *ptr, unsigned long ip)
{
if (ptr != page_address(virt_to_head_page(ptr))) {
- kasan_report_invalid_free(ptr, ip);
+ kasan_report_invalid_free(ptr, ip, KASAN_REPORT_INVALID_FREE);
return true;
}
if (!kasan_byte_accessible(ptr)) {
- kasan_report_invalid_free(ptr, ip);
+ kasan_report_invalid_free(ptr, ip, KASAN_REPORT_DOUBLE_FREE);
return true;
}
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
index 9e1b6544bfa8..9ad8eff71b28 100644
--- a/mm/kasan/hw_tags.c
+++ b/mm/kasan/hw_tags.c
@@ -257,27 +257,37 @@ static void unpoison_vmalloc_pages(const void *addr, u8 tag)
}
}
+static void init_vmalloc_pages(const void *start, unsigned long size)
+{
+ const void *addr;
+
+ for (addr = start; addr < start + size; addr += PAGE_SIZE) {
+ struct page *page = virt_to_page(addr);
+
+ clear_highpage_kasan_tagged(page);
+ }
+}
+
void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
kasan_vmalloc_flags_t flags)
{
u8 tag;
unsigned long redzone_start, redzone_size;
- if (!kasan_vmalloc_enabled())
- return (void *)start;
-
- if (!is_vmalloc_or_module_addr(start))
+ if (!kasan_vmalloc_enabled() || !is_vmalloc_or_module_addr(start)) {
+ if (flags & KASAN_VMALLOC_INIT)
+ init_vmalloc_pages(start, size);
return (void *)start;
+ }
/*
- * Skip unpoisoning and assigning a pointer tag for non-VM_ALLOC
- * mappings as:
+ * Don't tag non-VM_ALLOC mappings, as:
*
* 1. Unlike the software KASAN modes, hardware tag-based KASAN only
* supports tagging physical memory. Therefore, it can only tag a
* single mapping of normal physical pages.
* 2. Hardware tag-based KASAN can only tag memory mapped with special
- * mapping protection bits, see arch_vmalloc_pgprot_modify().
+ * mapping protection bits, see arch_vmap_pgprot_tagged().
* As non-VM_ALLOC mappings can be mapped outside of vmalloc code,
* providing these bits would require tracking all non-VM_ALLOC
* mappers.
@@ -289,15 +299,19 @@ void *__kasan_unpoison_vmalloc(const void *start, unsigned long size,
*
* For non-VM_ALLOC allocations, page_alloc memory is tagged as usual.
*/
- if (!(flags & KASAN_VMALLOC_VM_ALLOC))
+ if (!(flags & KASAN_VMALLOC_VM_ALLOC)) {
+ WARN_ON(flags & KASAN_VMALLOC_INIT);
return (void *)start;
+ }
/*
* Don't tag executable memory.
* The kernel doesn't tolerate having the PC register tagged.
*/
- if (!(flags & KASAN_VMALLOC_PROT_NORMAL))
+ if (!(flags & KASAN_VMALLOC_PROT_NORMAL)) {
+ WARN_ON(flags & KASAN_VMALLOC_INIT);
return (void *)start;
+ }
tag = kasan_random_tag();
start = set_tag(start, tag);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 610d60d6e5b8..01c03e45acd4 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -125,6 +125,7 @@ static inline bool kasan_sync_fault_possible(void)
enum kasan_report_type {
KASAN_REPORT_ACCESS,
KASAN_REPORT_INVALID_FREE,
+ KASAN_REPORT_DOUBLE_FREE,
};
struct kasan_report_info {
@@ -277,7 +278,7 @@ static inline void kasan_print_address_stack_frame(const void *addr) { }
bool kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
-void kasan_report_invalid_free(void *object, unsigned long ip);
+void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type);
struct page *kasan_addr_to_page(const void *addr);
struct slab *kasan_addr_to_slab(const void *addr);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index b341a191651d..fe3f606b3a98 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -176,8 +176,12 @@ static void end_report(unsigned long *flags, void *addr)
static void print_error_description(struct kasan_report_info *info)
{
if (info->type == KASAN_REPORT_INVALID_FREE) {
- pr_err("BUG: KASAN: double-free or invalid-free in %pS\n",
- (void *)info->ip);
+ pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip);
+ return;
+ }
+
+ if (info->type == KASAN_REPORT_DOUBLE_FREE) {
+ pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip);
return;
}
@@ -433,7 +437,7 @@ static void print_report(struct kasan_report_info *info)
}
}
-void kasan_report_invalid_free(void *ptr, unsigned long ip)
+void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type)
{
unsigned long flags;
struct kasan_report_info info;
@@ -448,7 +452,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip)
start_report(&flags, true);
- info.type = KASAN_REPORT_INVALID_FREE;
+ info.type = type;
info.access_addr = ptr;
info.first_bad_addr = kasan_reset_tag(ptr);
info.access_size = 0;
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
index a4f07de21771..0e3648b603a6 100644
--- a/mm/kasan/shadow.c
+++ b/mm/kasan/shadow.c
@@ -295,9 +295,22 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
return 0;
shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
- shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
- shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+
+ /*
+ * User Mode Linux maps enough shadow memory for all of virtual memory
+ * at boot, so doesn't need to allocate more on vmalloc, just clear it.
+ *
+ * The remaining CONFIG_UML checks in this file exist for the same
+ * reason.
+ */
+ if (IS_ENABLED(CONFIG_UML)) {
+ __memset((void *)shadow_start, KASAN_VMALLOC_INVALID, shadow_end - shadow_start);
+ return 0;
+ }
+
+ shadow_start = PAGE_ALIGN_DOWN(shadow_start);
+ shadow_end = PAGE_ALIGN(shadow_end);
ret = apply_to_page_range(&init_mm, shadow_start,
shadow_end - shadow_start,
@@ -466,6 +479,10 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end,
if (shadow_end > shadow_start) {
size = shadow_end - shadow_start;
+ if (IS_ENABLED(CONFIG_UML)) {
+ __memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start);
+ return;
+ }
apply_to_existing_page_range(&init_mm,
(unsigned long)shadow_start,
size, kasan_depopulate_vmalloc_pte,
@@ -531,6 +548,11 @@ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask)
if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
return -EINVAL;
+ if (IS_ENABLED(CONFIG_UML)) {
+ __memset((void *)shadow_start, KASAN_SHADOW_INIT, shadow_size);
+ return 0;
+ }
+
ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
shadow_start + shadow_size,
GFP_KERNEL,
@@ -554,6 +576,9 @@ int kasan_alloc_module_shadow(void *addr, size_t size, gfp_t gfp_mask)
void kasan_free_module_shadow(const struct vm_struct *vm)
{
+ if (IS_ENABLED(CONFIG_UML))
+ return;
+
if (vm->flags & VM_KASAN)
vfree(kasan_mem_to_shadow(vm->addr));
}
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 4e7cd4c8e687..c252081b11df 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -360,6 +360,9 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
unsigned long flags;
struct slab *slab;
void *addr;
+ const bool random_right_allocate = prandom_u32_max(2);
+ const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS &&
+ !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS);
/* Try to obtain a free object. */
raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
@@ -404,7 +407,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
* is that the out-of-bounds accesses detected are deterministic for
* such allocations.
*/
- if (prandom_u32_max(2)) {
+ if (random_right_allocate) {
/* Allocate on the "right" side, re-calculate address. */
meta->addr += PAGE_SIZE - size;
meta->addr = ALIGN_DOWN(meta->addr, cache->align);
@@ -444,7 +447,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
if (cache->ctor)
cache->ctor(addr);
- if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS))
+ if (random_fault)
kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
@@ -543,7 +546,7 @@ static unsigned long kfence_init_pool(void)
if (!arch_kfence_init_pool())
return addr;
- pages = virt_to_page(addr);
+ pages = virt_to_page(__kfence_pool);
/*
* Set up object pages: they must have PG_slab set, to avoid freeing
@@ -600,14 +603,6 @@ static unsigned long kfence_init_pool(void)
addr += 2 * PAGE_SIZE;
}
- /*
- * The pool is live and will never be deallocated from this point on.
- * Remove the pool object from the kmemleak object tree, as it would
- * otherwise overlap with allocations returned by kfence_alloc(), which
- * are registered with kmemleak through the slab post-alloc hook.
- */
- kmemleak_free(__kfence_pool);
-
return 0;
}
@@ -620,8 +615,16 @@ static bool __init kfence_init_pool_early(void)
addr = kfence_init_pool();
- if (!addr)
+ if (!addr) {
+ /*
+ * The pool is live and will never be deallocated from this point on.
+ * Ignore the pool object from the kmemleak phys object tree, as it would
+ * otherwise overlap with allocations returned by kfence_alloc(), which
+ * are registered with kmemleak through the slab post-alloc hook.
+ */
+ kmemleak_ignore_phys(__pa(__kfence_pool));
return true;
+ }
/*
* Only release unprotected pages, and do not try to go back and change
@@ -657,7 +660,7 @@ static bool kfence_init_pool_late(void)
/* Same as above. */
free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool);
#ifdef CONFIG_CONTIG_ALLOC
- free_contig_range(page_to_pfn(virt_to_page(addr)), free_size / PAGE_SIZE);
+ free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE);
#else
free_pages_exact((void *)addr, free_size);
#endif
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 16be62d493cd..01f71786d530 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -147,8 +147,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute scan_sleep_millisecs_attr =
- __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
- scan_sleep_millisecs_store);
+ __ATTR_RW(scan_sleep_millisecs);
static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -175,8 +174,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute alloc_sleep_millisecs_attr =
- __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
- alloc_sleep_millisecs_store);
+ __ATTR_RW(alloc_sleep_millisecs);
static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -200,8 +198,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute pages_to_scan_attr =
- __ATTR(pages_to_scan, 0644, pages_to_scan_show,
- pages_to_scan_store);
+ __ATTR_RW(pages_to_scan);
static ssize_t pages_collapsed_show(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -221,22 +218,21 @@ static ssize_t full_scans_show(struct kobject *kobj,
static struct kobj_attribute full_scans_attr =
__ATTR_RO(full_scans);
-static ssize_t khugepaged_defrag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t defrag_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
return single_hugepage_flag_show(kobj, attr, buf,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
-static ssize_t khugepaged_defrag_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t defrag_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
return single_hugepage_flag_store(kobj, attr, buf, count,
TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static struct kobj_attribute khugepaged_defrag_attr =
- __ATTR(defrag, 0644, khugepaged_defrag_show,
- khugepaged_defrag_store);
+ __ATTR_RW(defrag);
/*
* max_ptes_none controls if khugepaged should collapse hugepages over
@@ -246,21 +242,21 @@ static struct kobj_attribute khugepaged_defrag_attr =
* runs. Increasing max_ptes_none will instead potentially reduce the
* free memory in the system during the khugepaged scan.
*/
-static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_none_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
-static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_none_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_none;
err = kstrtoul(buf, 10, &max_ptes_none);
- if (err || max_ptes_none > HPAGE_PMD_NR-1)
+ if (err || max_ptes_none > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_none = max_ptes_none;
@@ -268,25 +264,24 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
return count;
}
static struct kobj_attribute khugepaged_max_ptes_none_attr =
- __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
- khugepaged_max_ptes_none_store);
+ __ATTR_RW(max_ptes_none);
-static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_swap_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}
-static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_swap_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_swap;
err = kstrtoul(buf, 10, &max_ptes_swap);
- if (err || max_ptes_swap > HPAGE_PMD_NR-1)
+ if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_swap = max_ptes_swap;
@@ -295,25 +290,24 @@ static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
}
static struct kobj_attribute khugepaged_max_ptes_swap_attr =
- __ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
- khugepaged_max_ptes_swap_store);
+ __ATTR_RW(max_ptes_swap);
-static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+static ssize_t max_ptes_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
{
return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}
-static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t max_ptes_shared_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int err;
unsigned long max_ptes_shared;
err = kstrtoul(buf, 10, &max_ptes_shared);
- if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+ if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
return -EINVAL;
khugepaged_max_ptes_shared = max_ptes_shared;
@@ -322,8 +316,7 @@ static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
}
static struct kobj_attribute khugepaged_max_ptes_shared_attr =
- __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
- khugepaged_max_ptes_shared_store);
+ __ATTR_RW(max_ptes_shared);
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
@@ -437,43 +430,6 @@ static inline int khugepaged_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
-bool hugepage_vma_check(struct vm_area_struct *vma,
- unsigned long vm_flags)
-{
- if (!transhuge_vma_enabled(vma, vm_flags))
- return false;
-
- if (vm_flags & VM_NO_KHUGEPAGED)
- return false;
-
- /* Don't run khugepaged against DAX vma */
- if (vma_is_dax(vma))
- return false;
-
- if (vma->vm_file && !IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) -
- vma->vm_pgoff, HPAGE_PMD_NR))
- return false;
-
- /* Enabled via shmem mount options or sysfs settings. */
- if (shmem_file(vma->vm_file))
- return shmem_huge_enabled(vma);
-
- /* THP settings require madvise. */
- if (!(vm_flags & VM_HUGEPAGE) && !khugepaged_always())
- return false;
-
- /* Only regular file is valid */
- if (file_thp_enabled(vma))
- return true;
-
- if (!vma->anon_vma || !vma_is_anonymous(vma))
- return false;
- if (vma_is_temporary_stack(vma))
- return false;
-
- return true;
-}
-
void __khugepaged_enter(struct mm_struct *mm)
{
struct mm_slot *mm_slot;
@@ -509,10 +465,8 @@ void khugepaged_enter_vma(struct vm_area_struct *vma,
unsigned long vm_flags)
{
if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
- khugepaged_enabled() &&
- (((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
- (vma->vm_end & HPAGE_PMD_MASK))) {
- if (hugepage_vma_check(vma, vm_flags))
+ hugepage_flags_enabled()) {
+ if (hugepage_vma_check(vma, vm_flags, false, false))
__khugepaged_enter(vma->vm_mm);
}
}
@@ -599,7 +553,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false;
- for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (pte_none(pteval) || (pte_present(pteval) &&
@@ -618,7 +572,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
page = vm_normal_page(vma, address, pteval);
- if (unlikely(!page)) {
+ if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out;
}
@@ -762,7 +716,12 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
list_del(&src_page->lru);
- release_pte_page(src_page);
+ mod_node_page_state(page_pgdat(src_page),
+ NR_ISOLATED_ANON + page_is_file_lru(src_page),
+ -compound_nr(src_page));
+ unlock_page(src_page);
+ free_swap_cache(src_page);
+ putback_lru_page(src_page);
}
}
@@ -802,6 +761,10 @@ static bool khugepaged_scan_abort(int nid)
return false;
}
+#define khugepaged_defrag() \
+ (transparent_hugepage_flags & \
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
+
/* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
{
@@ -899,7 +862,7 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
khugepaged_alloc_sleep();
} else
count_vm_event(THP_COLLAPSE_ALLOC);
- } while (unlikely(!hpage) && likely(khugepaged_enabled()));
+ } while (unlikely(!hpage) && likely(hugepage_flags_enabled()));
return hpage;
}
@@ -947,7 +910,6 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
struct vm_area_struct **vmap)
{
struct vm_area_struct *vma;
- unsigned long hstart, hend;
if (unlikely(khugepaged_test_exit(mm)))
return SCAN_ANY_PROCESS;
@@ -956,13 +918,17 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
if (!vma)
return SCAN_VMA_NULL;
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (address < hstart || address + HPAGE_PMD_SIZE > hend)
+ if (!transhuge_vma_suitable(vma, address))
return SCAN_ADDRESS_RANGE;
- if (!hugepage_vma_check(vma, vma->vm_flags))
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false))
return SCAN_VMA_CHECK;
- /* Anon VMA expected */
+ /*
+ * Anon VMA expected, the address may be unmapped then
+ * remapped to file after khugepaged reaquired the mmap_lock.
+ *
+ * hugepage_vma_check may return true for qualified file
+ * vmas.
+ */
if (!vma->anon_vma || !vma_is_anonymous(vma))
return SCAN_VMA_CHECK;
return 0;
@@ -972,8 +938,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
* Bring missing pages in from swap, to complete THP collapse.
* Only done if khugepaged_scan_pmd believes it is worthwhile.
*
- * Called and returns without pte mapped or spinlocks held,
- * but with mmap_lock held to protect against vma changes.
+ * Called and returns without pte mapped or spinlocks held.
+ * Note that if false is returned, mmap_lock will be released.
*/
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
@@ -1000,27 +966,24 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
pte_unmap(vmf.pte);
continue;
}
- swapped_in++;
ret = do_swap_page(&vmf);
- /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
+ /*
+ * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
+ * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
+ * we do not retry here and swap entry will remain in pagetable
+ * resulting in later failure.
+ */
if (ret & VM_FAULT_RETRY) {
- mmap_read_lock(mm);
- if (hugepage_vma_revalidate(mm, haddr, &vma)) {
- /* vma is no longer available, don't continue to swapin */
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
- /* check if the pmd is still valid */
- if (mm_find_pmd(mm, haddr) != pmd) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
+ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
+ return false;
}
if (ret & VM_FAULT_ERROR) {
+ mmap_read_unlock(mm);
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
+ swapped_in++;
}
/* Drain LRU add pagevec to remove extra pin on the swapped in pages */
@@ -1086,13 +1049,12 @@ static void collapse_huge_page(struct mm_struct *mm,
}
/*
- * __collapse_huge_page_swapin always returns with mmap_lock locked.
- * If it fails, we release mmap_lock and jump out_nolock.
+ * __collapse_huge_page_swapin will return with mmap_lock released
+ * when it fails. So we jump out_nolock directly in that case.
* Continuing to collapse causes inconsistency.
*/
if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
pmd, referenced)) {
- mmap_read_unlock(mm);
goto out_nolock;
}
@@ -1219,7 +1181,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
- for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
+ for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
pte_t pteval = *_pte;
if (is_swap_pte(pteval)) {
@@ -1267,7 +1229,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
writable = true;
page = vm_normal_page(vma, _address, pteval);
- if (unlikely(!page)) {
+ if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
result = SCAN_PAGE_NULL;
goto out_unmap;
}
@@ -1309,7 +1271,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
/*
* Check if the page has any GUP (or other external) pins.
*
- * Here the check is racy it may see totmal_mapcount > refcount
+ * Here the check is racy it may see total_mapcount > refcount
* in some cases.
* For example, one process with one forked child process.
* The parent has the PMD split due to MADV_DONTNEED, then
@@ -1382,8 +1344,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
* Notify khugepaged that given addr of the mm is pte-mapped THP. Then
* khugepaged should try to collapse the page table.
*/
-static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
- unsigned long addr)
+static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+ unsigned long addr)
{
struct mm_slot *mm_slot;
@@ -1394,7 +1356,6 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP))
mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
spin_unlock(&khugepaged_mm_lock);
- return 0;
}
static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -1444,7 +1405,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
* the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
* will not fail the vma for missing VM_HUGEPAGE
*/
- if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+ if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false))
return;
/* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
@@ -1479,7 +1440,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
goto abort;
page = vm_normal_page(vma, addr, *pte);
-
+ if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ page = NULL;
/*
* Note that uprobe, debugger, or MAP_PRIVATE may change the
* page table, but the new page will not be a subpage of hpage.
@@ -1497,6 +1459,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (pte_none(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
+ if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+ goto abort;
page_remove_rmap(page, vma, false);
}
@@ -1557,7 +1521,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* mmap_write_lock(mm) as PMD-mapping is likely to be split
* later.
*
- * Not that vma->anon_vma check is racy: it can be set up after
+ * Note that vma->anon_vma check is racy: it can be set up after
* the check but before we took mmap_lock by the fault path.
* But page lock would prevent establishing any new ptes of the
* page, so we are safe.
@@ -1885,8 +1849,8 @@ out_unlock:
if (nr_none) {
__mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
- if (is_shmem)
- __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
+ /* nr_none is always 0 for non-shmem. */
+ __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
/* Join all the small entries into a single multi-index entry */
@@ -1950,10 +1914,10 @@ xa_unlocked:
/* Something went wrong: roll back page cache changes */
xas_lock_irq(&xas);
- mapping->nrpages -= nr_none;
-
- if (is_shmem)
+ if (nr_none) {
+ mapping->nrpages -= nr_none;
shmem_uncharge(mapping->host, nr_none);
+ }
xas_set(&xas, start);
xas_for_each(&xas, page, end - 1) {
@@ -2131,22 +2095,18 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
progress++;
break;
}
- if (!hugepage_vma_check(vma, vma->vm_flags)) {
+ if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) {
skip:
progress++;
continue;
}
- hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
- hend = vma->vm_end & HPAGE_PMD_MASK;
- if (hstart >= hend)
- goto skip;
+ hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
+ hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
if (khugepaged_scan.address > hend)
goto skip;
if (khugepaged_scan.address < hstart)
khugepaged_scan.address = hstart;
VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
- if (shmem_file(vma->vm_file) && !shmem_huge_enabled(vma))
- goto skip;
while (khugepaged_scan.address < hend) {
int ret;
@@ -2216,7 +2176,7 @@ breakouterloop_mmap_lock:
static int khugepaged_has_work(void)
{
return !list_empty(&khugepaged_scan.mm_head) &&
- khugepaged_enabled();
+ hugepage_flags_enabled();
}
static int khugepaged_wait_event(void)
@@ -2281,7 +2241,7 @@ static void khugepaged_wait_work(void)
return;
}
- if (khugepaged_enabled())
+ if (hugepage_flags_enabled())
wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
}
@@ -2312,7 +2272,7 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
- if (!khugepaged_enabled()) {
+ if (!hugepage_flags_enabled()) {
calculate_min_free_kbytes();
goto update_wmarks;
}
@@ -2362,7 +2322,7 @@ int start_stop_khugepaged(void)
int err = 0;
mutex_lock(&khugepaged_mutex);
- if (khugepaged_enabled()) {
+ if (hugepage_flags_enabled()) {
if (!khugepaged_thread)
khugepaged_thread = kthread_run(khugepaged, NULL,
"khugepaged");
@@ -2388,7 +2348,7 @@ fail:
void khugepaged_min_free_kbytes_update(void)
{
mutex_lock(&khugepaged_mutex);
- if (khugepaged_enabled() && khugepaged_thread)
+ if (hugepage_flags_enabled() && khugepaged_thread)
set_recommended_min_free_kbytes();
mutex_unlock(&khugepaged_mutex);
}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a182f5ddaf68..1eddc0132f7f 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -14,14 +14,16 @@
* The following locks and mutexes are used by kmemleak:
*
* - kmemleak_lock (raw_spinlock_t): protects the object_list modifications and
- * accesses to the object_tree_root. The object_list is the main list
- * holding the metadata (struct kmemleak_object) for the allocated memory
- * blocks. The object_tree_root is a red black tree used to look-up
- * metadata based on a pointer to the corresponding memory block. The
- * kmemleak_object structures are added to the object_list and
- * object_tree_root in the create_object() function called from the
- * kmemleak_alloc() callback and removed in delete_object() called from the
- * kmemleak_free() callback
+ * accesses to the object_tree_root (or object_phys_tree_root). The
+ * object_list is the main list holding the metadata (struct kmemleak_object)
+ * for the allocated memory blocks. The object_tree_root and object_phys_tree_root
+ * are red black trees used to look-up metadata based on a pointer to the
+ * corresponding memory block. The object_phys_tree_root is for objects
+ * allocated with physical address. The kmemleak_object structures are
+ * added to the object_list and object_tree_root (or object_phys_tree_root)
+ * in the create_object() function called from the kmemleak_alloc() (or
+ * kmemleak_alloc_phys()) callback and removed in delete_object() called from
+ * the kmemleak_free() callback
* - kmemleak_object.lock (raw_spinlock_t): protects a kmemleak_object.
* Accesses to the metadata (e.g. count) are protected by this lock. Note
* that some members of this structure may be protected by other means
@@ -172,6 +174,8 @@ struct kmemleak_object {
#define OBJECT_NO_SCAN (1 << 2)
/* flag set to fully scan the object when scan_area allocation failed */
#define OBJECT_FULL_SCAN (1 << 3)
+/* flag set for object allocated with physical address */
+#define OBJECT_PHYS (1 << 4)
#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
@@ -193,7 +197,9 @@ static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
-/* protecting the access to object_list and object_tree_root */
+/* search tree for object (with OBJECT_PHYS flag) boundaries */
+static struct rb_root object_phys_tree_root = RB_ROOT;
+/* protecting the access to object_list, object_tree_root (or object_phys_tree_root) */
static DEFINE_RAW_SPINLOCK(kmemleak_lock);
/* allocation caches for kmemleak internal data */
@@ -285,6 +291,9 @@ static void hex_dump_object(struct seq_file *seq,
const u8 *ptr = (const u8 *)object->pointer;
size_t len;
+ if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+ return;
+
/* limit the number of lines to HEX_MAX_LINES */
len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
@@ -378,9 +387,11 @@ static void dump_object_info(struct kmemleak_object *object)
* beginning of the memory block are allowed. The kmemleak_lock must be held
* when calling this function.
*/
-static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+static struct kmemleak_object *__lookup_object(unsigned long ptr, int alias,
+ bool is_phys)
{
- struct rb_node *rb = object_tree_root.rb_node;
+ struct rb_node *rb = is_phys ? object_phys_tree_root.rb_node :
+ object_tree_root.rb_node;
unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
while (rb) {
@@ -406,6 +417,12 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
return NULL;
}
+/* Look-up a kmemleak object which allocated with virtual address. */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+ return __lookup_object(ptr, alias, false);
+}
+
/*
* Increment the object use_count. Return 1 if successful or 0 otherwise. Note
* that once an object's use_count reached 0, the RCU freeing was already
@@ -515,14 +532,15 @@ static void put_object(struct kmemleak_object *object)
/*
* Look up an object in the object search tree and increase its use_count.
*/
-static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+static struct kmemleak_object *__find_and_get_object(unsigned long ptr, int alias,
+ bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object;
rcu_read_lock();
raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = lookup_object(ptr, alias);
+ object = __lookup_object(ptr, alias, is_phys);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
/* check whether the object is still available */
@@ -533,28 +551,39 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
return object;
}
+/* Look up and get an object which allocated with virtual address. */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+ return __find_and_get_object(ptr, alias, false);
+}
+
/*
- * Remove an object from the object_tree_root and object_list. Must be called
- * with the kmemleak_lock held _if_ kmemleak is still enabled.
+ * Remove an object from the object_tree_root (or object_phys_tree_root)
+ * and object_list. Must be called with the kmemleak_lock held _if_ kmemleak
+ * is still enabled.
*/
static void __remove_object(struct kmemleak_object *object)
{
- rb_erase(&object->rb_node, &object_tree_root);
+ rb_erase(&object->rb_node, object->flags & OBJECT_PHYS ?
+ &object_phys_tree_root :
+ &object_tree_root);
list_del_rcu(&object->object_list);
}
/*
* Look up an object in the object search tree and remove it from both
- * object_tree_root and object_list. The returned object's use_count should be
- * at least 1, as initially set by create_object().
+ * object_tree_root (or object_phys_tree_root) and object_list. The
+ * returned object's use_count should be at least 1, as initially set
+ * by create_object().
*/
-static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias)
+static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int alias,
+ bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object;
raw_spin_lock_irqsave(&kmemleak_lock, flags);
- object = lookup_object(ptr, alias);
+ object = __lookup_object(ptr, alias, is_phys);
if (object)
__remove_object(object);
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
@@ -572,10 +601,12 @@ static int __save_stack_trace(unsigned long *trace)
/*
* Create the metadata (struct kmemleak_object) corresponding to an allocated
- * memory block and add it to the object_list and object_tree_root.
+ * memory block and add it to the object_list and object_tree_root (or
+ * object_phys_tree_root).
*/
-static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
- int min_count, gfp_t gfp)
+static struct kmemleak_object *__create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp,
+ bool is_phys)
{
unsigned long flags;
struct kmemleak_object *object, *parent;
@@ -595,7 +626,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
INIT_HLIST_HEAD(&object->area_list);
raw_spin_lock_init(&object->lock);
atomic_set(&object->use_count, 1);
- object->flags = OBJECT_ALLOCATED;
+ object->flags = OBJECT_ALLOCATED | (is_phys ? OBJECT_PHYS : 0);
object->pointer = ptr;
object->size = kfence_ksize((void *)ptr) ?: size;
object->excess_ref = 0;
@@ -628,9 +659,16 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
raw_spin_lock_irqsave(&kmemleak_lock, flags);
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
- min_addr = min(min_addr, untagged_ptr);
- max_addr = max(max_addr, untagged_ptr + size);
- link = &object_tree_root.rb_node;
+ /*
+ * Only update min_addr and max_addr with object
+ * storing virtual address.
+ */
+ if (!is_phys) {
+ min_addr = min(min_addr, untagged_ptr);
+ max_addr = max(max_addr, untagged_ptr + size);
+ }
+ link = is_phys ? &object_phys_tree_root.rb_node :
+ &object_tree_root.rb_node;
rb_parent = NULL;
while (*link) {
rb_parent = *link;
@@ -654,7 +692,8 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
}
}
rb_link_node(&object->rb_node, rb_parent, link);
- rb_insert_color(&object->rb_node, &object_tree_root);
+ rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root :
+ &object_tree_root);
list_add_tail_rcu(&object->object_list, &object_list);
out:
@@ -662,6 +701,20 @@ out:
return object;
}
+/* Create kmemleak object which allocated with virtual address. */
+static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ return __create_object(ptr, size, min_count, gfp, false);
+}
+
+/* Create kmemleak object which allocated with physical address. */
+static struct kmemleak_object *create_object_phys(unsigned long ptr, size_t size,
+ int min_count, gfp_t gfp)
+{
+ return __create_object(ptr, size, min_count, gfp, true);
+}
+
/*
* Mark the object as not allocated and schedule RCU freeing via put_object().
*/
@@ -690,7 +743,7 @@ static void delete_object_full(unsigned long ptr)
{
struct kmemleak_object *object;
- object = find_and_remove_object(ptr, 0);
+ object = find_and_remove_object(ptr, 0, false);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Freeing unknown object at 0x%08lx\n",
@@ -706,12 +759,12 @@ static void delete_object_full(unsigned long ptr)
* delete it. If the memory block is partially freed, the function may create
* additional metadata for the remaining parts of the block.
*/
-static void delete_object_part(unsigned long ptr, size_t size)
+static void delete_object_part(unsigned long ptr, size_t size, bool is_phys)
{
struct kmemleak_object *object;
unsigned long start, end;
- object = find_and_remove_object(ptr, 1);
+ object = find_and_remove_object(ptr, 1, is_phys);
if (!object) {
#ifdef DEBUG
kmemleak_warn("Partially freeing unknown object at 0x%08lx (size %zu)\n",
@@ -728,11 +781,11 @@ static void delete_object_part(unsigned long ptr, size_t size)
start = object->pointer;
end = object->pointer + object->size;
if (ptr > start)
- create_object(start, ptr - start, object->min_count,
- GFP_KERNEL);
+ __create_object(start, ptr - start, object->min_count,
+ GFP_KERNEL, is_phys);
if (ptr + size < end)
- create_object(ptr + size, end - ptr - size, object->min_count,
- GFP_KERNEL);
+ __create_object(ptr + size, end - ptr - size, object->min_count,
+ GFP_KERNEL, is_phys);
__delete_object(object);
}
@@ -753,11 +806,11 @@ static void paint_it(struct kmemleak_object *object, int color)
raw_spin_unlock_irqrestore(&object->lock, flags);
}
-static void paint_ptr(unsigned long ptr, int color)
+static void paint_ptr(unsigned long ptr, int color, bool is_phys)
{
struct kmemleak_object *object;
- object = find_and_get_object(ptr, 0);
+ object = __find_and_get_object(ptr, 0, is_phys);
if (!object) {
kmemleak_warn("Trying to color unknown object at 0x%08lx as %s\n",
ptr,
@@ -775,16 +828,16 @@ static void paint_ptr(unsigned long ptr, int color)
*/
static void make_gray_object(unsigned long ptr)
{
- paint_ptr(ptr, KMEMLEAK_GREY);
+ paint_ptr(ptr, KMEMLEAK_GREY, false);
}
/*
* Mark the object as black-colored so that it is ignored from scans and
* reporting.
*/
-static void make_black_object(unsigned long ptr)
+static void make_black_object(unsigned long ptr, bool is_phys)
{
- paint_ptr(ptr, KMEMLEAK_BLACK);
+ paint_ptr(ptr, KMEMLEAK_BLACK, is_phys);
}
/*
@@ -990,7 +1043,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- delete_object_part((unsigned long)ptr, size);
+ delete_object_part((unsigned long)ptr, size, false);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -1078,7 +1131,7 @@ void __ref kmemleak_ignore(const void *ptr)
pr_debug("%s(0x%p)\n", __func__, ptr);
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- make_black_object((unsigned long)ptr);
+ make_black_object((unsigned long)ptr, false);
}
EXPORT_SYMBOL(kmemleak_ignore);
@@ -1125,15 +1178,18 @@ EXPORT_SYMBOL(kmemleak_no_scan);
* address argument
* @phys: physical address of the object
* @size: size of the object
- * @min_count: minimum number of references to this object.
- * See kmemleak_alloc()
* @gfp: kmalloc() flags used for kmemleak internal memory allocations
*/
-void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, int min_count,
- gfp_t gfp)
+void __ref kmemleak_alloc_phys(phys_addr_t phys, size_t size, gfp_t gfp)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
- kmemleak_alloc(__va(phys), size, min_count, gfp);
+ pr_debug("%s(0x%pa, %zu)\n", __func__, &phys, size);
+
+ if (kmemleak_enabled)
+ /*
+ * Create object with OBJECT_PHYS flag and
+ * assume min_count 0.
+ */
+ create_object_phys((unsigned long)phys, size, 0, gfp);
}
EXPORT_SYMBOL(kmemleak_alloc_phys);
@@ -1146,22 +1202,12 @@ EXPORT_SYMBOL(kmemleak_alloc_phys);
*/
void __ref kmemleak_free_part_phys(phys_addr_t phys, size_t size)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
- kmemleak_free_part(__va(phys), size);
-}
-EXPORT_SYMBOL(kmemleak_free_part_phys);
+ pr_debug("%s(0x%pa)\n", __func__, &phys);
-/**
- * kmemleak_not_leak_phys - similar to kmemleak_not_leak but taking a physical
- * address argument
- * @phys: physical address of the object
- */
-void __ref kmemleak_not_leak_phys(phys_addr_t phys)
-{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
- kmemleak_not_leak(__va(phys));
+ if (kmemleak_enabled)
+ delete_object_part((unsigned long)phys, size, true);
}
-EXPORT_SYMBOL(kmemleak_not_leak_phys);
+EXPORT_SYMBOL(kmemleak_free_part_phys);
/**
* kmemleak_ignore_phys - similar to kmemleak_ignore but taking a physical
@@ -1170,8 +1216,10 @@ EXPORT_SYMBOL(kmemleak_not_leak_phys);
*/
void __ref kmemleak_ignore_phys(phys_addr_t phys)
{
- if (PHYS_PFN(phys) >= min_low_pfn && PHYS_PFN(phys) < max_low_pfn)
- kmemleak_ignore(__va(phys));
+ pr_debug("%s(0x%pa)\n", __func__, &phys);
+
+ if (kmemleak_enabled)
+ make_black_object((unsigned long)phys, true);
}
EXPORT_SYMBOL(kmemleak_ignore_phys);
@@ -1182,6 +1230,9 @@ static bool update_checksum(struct kmemleak_object *object)
{
u32 old_csum = object->checksum;
+ if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
+ return false;
+
kasan_disable_current();
kcsan_disable_current();
object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size);
@@ -1335,6 +1386,7 @@ static void scan_object(struct kmemleak_object *object)
{
struct kmemleak_scan_area *area;
unsigned long flags;
+ void *obj_ptr;
/*
* Once the object->lock is acquired, the corresponding memory block
@@ -1346,10 +1398,15 @@ static void scan_object(struct kmemleak_object *object)
if (!(object->flags & OBJECT_ALLOCATED))
/* already freed object */
goto out;
+
+ obj_ptr = object->flags & OBJECT_PHYS ?
+ __va((phys_addr_t)object->pointer) :
+ (void *)object->pointer;
+
if (hlist_empty(&object->area_list) ||
object->flags & OBJECT_FULL_SCAN) {
- void *start = (void *)object->pointer;
- void *end = (void *)(object->pointer + object->size);
+ void *start = obj_ptr;
+ void *end = obj_ptr + object->size;
void *next;
do {
@@ -1413,18 +1470,21 @@ static void scan_gray_list(void)
*/
static void kmemleak_scan(void)
{
- unsigned long flags;
struct kmemleak_object *object;
struct zone *zone;
int __maybe_unused i;
int new_leaks = 0;
+ int loop1_cnt = 0;
jiffies_last_scan = jiffies;
/* prepare the kmemleak_object's */
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ bool obj_pinned = false;
+
+ loop1_cnt++;
+ raw_spin_lock_irq(&object->lock);
#ifdef DEBUG
/*
* With a few exceptions there should be a maximum of
@@ -1436,12 +1496,45 @@ static void kmemleak_scan(void)
dump_object_info(object);
}
#endif
+
+ /* ignore objects outside lowmem (paint them black) */
+ if ((object->flags & OBJECT_PHYS) &&
+ !(object->flags & OBJECT_NO_SCAN)) {
+ unsigned long phys = object->pointer;
+
+ if (PHYS_PFN(phys) < min_low_pfn ||
+ PHYS_PFN(phys + object->size) >= max_low_pfn)
+ __paint_it(object, KMEMLEAK_BLACK);
+ }
+
/* reset the reference count (whiten the object) */
object->count = 0;
- if (color_gray(object) && get_object(object))
+ if (color_gray(object) && get_object(object)) {
list_add_tail(&object->gray_list, &gray_list);
+ obj_pinned = true;
+ }
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
+
+ /*
+ * Do a cond_resched() to avoid soft lockup every 64k objects.
+ * Make sure a reference has been taken so that the object
+ * won't go away without RCU read lock.
+ */
+ if (!(loop1_cnt & 0xffff)) {
+ if (!obj_pinned && !get_object(object)) {
+ /* Try the next object instead */
+ loop1_cnt--;
+ continue;
+ }
+
+ rcu_read_unlock();
+ cond_resched();
+ rcu_read_lock();
+
+ if (!obj_pinned)
+ put_object(object);
+ }
}
rcu_read_unlock();
@@ -1509,14 +1602,21 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ /*
+ * This is racy but we can save the overhead of lock/unlock
+ * calls. The missed objects, if any, should be caught in
+ * the next scan.
+ */
+ if (!color_white(object))
+ continue;
+ raw_spin_lock_irq(&object->lock);
if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
&& update_checksum(object) && get_object(object)) {
/* color it gray temporarily */
object->count = object->min_count;
list_add_tail(&object->gray_list, &gray_list);
}
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
@@ -1536,7 +1636,14 @@ static void kmemleak_scan(void)
*/
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ /*
+ * This is racy but we can save the overhead of lock/unlock
+ * calls. The missed objects, if any, should be caught in
+ * the next scan.
+ */
+ if (!color_white(object))
+ continue;
+ raw_spin_lock_irq(&object->lock);
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
@@ -1546,7 +1653,7 @@ static void kmemleak_scan(void)
new_leaks++;
}
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
@@ -1748,15 +1855,14 @@ static int dump_str_object_info(const char *str)
static void kmemleak_clear(void)
{
struct kmemleak_object *object;
- unsigned long flags;
rcu_read_lock();
list_for_each_entry_rcu(object, &object_list, object_list) {
- raw_spin_lock_irqsave(&object->lock, flags);
+ raw_spin_lock_irq(&object->lock);
if ((object->flags & OBJECT_REPORTED) &&
unreferenced_object(object))
__paint_it(object, KMEMLEAK_GREY);
- raw_spin_unlock_irqrestore(&object->lock, flags);
+ raw_spin_unlock_irq(&object->lock);
}
rcu_read_unlock();
diff --git a/mm/ksm.c b/mm/ksm.c
index 54f78c9eecae..42ab153335a2 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
cond_resched();
page = follow_page(vma, addr,
FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
- if (IS_ERR_OR_NULL(page))
+ if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
@@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
goto out;
page = follow_page(vma, addr, FOLL_GET);
- if (IS_ERR_OR_NULL(page))
+ if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
goto out;
if (PageAnon(page)) {
flush_anon_page(vma, page, addr);
@@ -712,7 +712,7 @@ again:
* however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
* the same is in reuse_ksm_page() case; but if page is swapcache
- * in migrate_page_move_mapping(), it might still be our page,
+ * in folio_migrate_mapping(), it might still be our page,
* in which case it's essential to keep the node.
*/
while (!get_page_unless_zero(page)) {
@@ -1083,7 +1083,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
* No need to notify as we are downgrading page table to read
* only not changing it to point to a new page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
/*
@@ -1186,7 +1186,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
* No need to notify as we are replacing a read only page with another
* read only page with the same content.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
ptep_clear_flush(vma, addr, ptep);
set_pte_at_notify(mm, addr, ptep, newpte);
@@ -2308,7 +2308,7 @@ next_mm:
if (ksm_test_exit(mm))
break;
*page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (IS_ERR_OR_NULL(*page)) {
+ if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
ksm_scan.address += PAGE_SIZE;
cond_resched();
continue;
diff --git a/mm/list_lru.c b/mm/list_lru.c
index ba76428ceece..a05e5bef3b40 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -71,7 +71,7 @@ list_lru_from_kmem(struct list_lru *lru, int nid, void *ptr,
if (!list_lru_memcg_aware(lru))
goto out;
- memcg = mem_cgroup_from_obj(ptr);
+ memcg = mem_cgroup_from_slab_obj(ptr);
if (!memcg)
goto out;
diff --git a/mm/madvise.c b/mm/madvise.c
index d7b4f2602949..5f0f0948a50e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -195,7 +195,6 @@ success:
static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
unsigned long end, struct mm_walk *walk)
{
- pte_t *orig_pte;
struct vm_area_struct *vma = walk->private;
unsigned long index;
struct swap_iocb *splug = NULL;
@@ -208,12 +207,13 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
swp_entry_t entry;
struct page *page;
spinlock_t *ptl;
+ pte_t *ptep;
- orig_pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl);
- pte = *(orig_pte + ((index - start) / PAGE_SIZE));
- pte_unmap_unlock(orig_pte, ptl);
+ ptep = pte_offset_map_lock(vma->vm_mm, pmd, index, &ptl);
+ pte = *ptep;
+ pte_unmap_unlock(ptep, ptl);
- if (pte_present(pte) || pte_none(pte))
+ if (!is_swap_pte(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
@@ -421,7 +421,7 @@ regular_page:
continue;
page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
@@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
}
page = vm_normal_page(vma, addr, ptent);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
@@ -1112,7 +1112,7 @@ static int madvise_inject_error(int behavior,
} else {
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
pfn, start);
- ret = memory_failure(pfn, MF_COUNT_INCREASED);
+ ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
if (ret == -EOPNOTSUPP)
ret = 0;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index e4f03a6e8e56..b5d3026979fc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -29,6 +29,10 @@
# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS
#endif
+#ifndef INIT_MEMBLOCK_MEMORY_REGIONS
+#define INIT_MEMBLOCK_MEMORY_REGIONS INIT_MEMBLOCK_REGIONS
+#endif
+
/**
* DOC: memblock overview
*
@@ -55,9 +59,9 @@
* the allocator metadata. The "memory" and "reserved" types are nicely
* wrapped with struct memblock. This structure is statically
* initialized at build time. The region arrays are initially sized to
- * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
- * for "reserved". The region array for "physmem" is initially sized to
- * %INIT_PHYSMEM_REGIONS.
+ * %INIT_MEMBLOCK_MEMORY_REGIONS for "memory" and
+ * %INIT_MEMBLOCK_RESERVED_REGIONS for "reserved". The region array
+ * for "physmem" is initially sized to %INIT_PHYSMEM_REGIONS.
* The memblock_allow_resize() enables automatic resizing of the region
* arrays during addition of new regions. This feature should be used
* with care so that memory allocated for the region array will not
@@ -102,7 +106,7 @@ unsigned long min_low_pfn;
unsigned long max_pfn;
unsigned long long max_possible_pfn;
-static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_MEMORY_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
@@ -111,7 +115,7 @@ static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS
struct memblock memblock __initdata_memblock = {
.memory.regions = memblock_memory_init_regions,
.memory.cnt = 1, /* empty dummy entry */
- .memory.max = INIT_MEMBLOCK_REGIONS,
+ .memory.max = INIT_MEMBLOCK_MEMORY_REGIONS,
.memory.name = "memory",
.reserved.regions = memblock_reserved_init_regions,
@@ -327,7 +331,7 @@ again:
NUMA_NO_NODE, flags);
if (!ret && (flags & MEMBLOCK_MIRROR)) {
- pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
&size);
flags &= ~MEMBLOCK_MIRROR;
goto again;
@@ -593,6 +597,17 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
type->total_size = size;
return 0;
}
+
+ /*
+ * The worst case is when new range overlaps all existing regions,
+ * then we'll need type->cnt + 1 empty regions in @type. So if
+ * type->cnt * 2 + 1 is less than type->max, we know
+ * that there is enough empty regions in @type, and we can insert
+ * regions directly.
+ */
+ if (type->cnt * 2 + 1 < type->max)
+ insert = true;
+
repeat:
/*
* The following is executed twice. Once with %false @insert and
@@ -924,6 +939,9 @@ int __init_memblock memblock_clear_hotplug(phys_addr_t base, phys_addr_t size)
*/
int __init_memblock memblock_mark_mirror(phys_addr_t base, phys_addr_t size)
{
+ if (!mirrored_kernelcore)
+ return 0;
+
system_has_some_mirror = true;
return memblock_setclr_flag(base, size, 1, MEMBLOCK_MIRROR);
@@ -1345,8 +1363,8 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
* from the regions with mirroring enabled and then retried from any
* memory region.
*
- * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
- * allocated boot memory block, so that it is never reported as leaks.
+ * In addition, function using kmemleak_alloc_phys for allocated boot
+ * memory block, it is never reported as leaks.
*
* Return:
* Physical address of allocated memory block on success, %0 on failure.
@@ -1384,7 +1402,7 @@ again:
if (flags & MEMBLOCK_MIRROR) {
flags &= ~MEMBLOCK_MIRROR;
- pr_warn("Could not allocate %pap bytes of mirrored memory\n",
+ pr_warn_ratelimited("Could not allocate %pap bytes of mirrored memory\n",
&size);
goto again;
}
@@ -1398,12 +1416,12 @@ done:
*/
if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
/*
- * The min_count is set to 0 so that memblock allocated
- * blocks are never reported as leaks. This is because many
- * of these blocks are only referred via the physical
- * address which is not looked up by kmemleak.
+ * Memblock allocated blocks are never reported as
+ * leaks. This is because many of these blocks are
+ * only referred via the physical address which is
+ * not looked up by kmemleak.
*/
- kmemleak_alloc_phys(found, size, 0, 0);
+ kmemleak_alloc_phys(found, size, 0);
return found;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index abec50f31fe6..b69979c9ced5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -626,7 +626,14 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
x = __this_cpu_add_return(stats_updates, abs(val));
if (x > MEMCG_CHARGE_BATCH) {
- atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
+ /*
+ * If stats_flush_threshold exceeds the threshold
+ * (>num_online_cpus()), cgroup stats update will be triggered
+ * in __mem_cgroup_flush_stats(). Increasing this var further
+ * is redundant and simply adds overhead in atomic update.
+ */
+ if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
+ atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
__this_cpu_write(stats_updates, 0);
}
}
@@ -783,7 +790,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
struct lruvec *lruvec;
rcu_read_lock();
- memcg = mem_cgroup_from_obj(p);
+ memcg = mem_cgroup_from_slab_obj(p);
/*
* Untracked pages have no memcg, no lruvec. Update only the
@@ -1460,14 +1467,35 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
}
-static char *memory_stat_format(struct mem_cgroup *memcg)
+/* Subset of vm_event_item to report for memcg event stats */
+static const unsigned int memcg_vm_event_stat[] = {
+ PGSCAN_KSWAPD,
+ PGSCAN_DIRECT,
+ PGSTEAL_KSWAPD,
+ PGSTEAL_DIRECT,
+ PGFAULT,
+ PGMAJFAULT,
+ PGREFILL,
+ PGACTIVATE,
+ PGDEACTIVATE,
+ PGLAZYFREE,
+ PGLAZYFREED,
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+ ZSWPIN,
+ ZSWPOUT,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ THP_FAULT_ALLOC,
+ THP_COLLAPSE_ALLOC,
+#endif
+};
+
+static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
{
struct seq_buf s;
int i;
- seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
- if (!s.buffer)
- return NULL;
+ seq_buf_init(&s, buf, bufsize);
/*
* Provide statistics on the state of the memory subsystem as
@@ -1495,46 +1523,20 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
}
/* Accumulated memory events */
-
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
- memcg_events(memcg, PGFAULT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
- memcg_events(memcg, PGMAJFAULT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGREFILL),
- memcg_events(memcg, PGREFILL));
seq_buf_printf(&s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
memcg_events(memcg, PGSCAN_DIRECT));
seq_buf_printf(&s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
memcg_events(memcg, PGSTEAL_DIRECT));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
- memcg_events(memcg, PGACTIVATE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
- memcg_events(memcg, PGDEACTIVATE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
- memcg_events(memcg, PGLAZYFREE));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
- memcg_events(memcg, PGLAZYFREED));
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN),
- memcg_events(memcg, ZSWPIN));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT),
- memcg_events(memcg, ZSWPOUT));
-#endif
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
- memcg_events(memcg, THP_FAULT_ALLOC));
- seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
- memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++)
+ seq_buf_printf(&s, "%s %lu\n",
+ vm_event_name(memcg_vm_event_stat[i]),
+ memcg_events(memcg, memcg_vm_event_stat[i]));
/* The above should easily fit into one page */
WARN_ON_ONCE(seq_buf_has_overflowed(&s));
-
- return s.buffer;
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -1570,7 +1572,10 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
*/
void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
{
- char *buf;
+ /* Use static buffer, for the caller is holding oom_lock. */
+ static char buf[PAGE_SIZE];
+
+ lockdep_assert_held(&oom_lock);
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
@@ -1591,11 +1596,8 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
pr_info("Memory cgroup stats for ");
pr_cont_cgroup_path(memcg->css.cgroup);
pr_cont(":");
- buf = memory_stat_format(memcg);
- if (!buf)
- return;
+ memory_stat_format(memcg, buf, sizeof(buf));
pr_info("%s", buf);
- kfree(buf);
}
/*
@@ -2331,7 +2333,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
- gfp_mask, true);
+ gfp_mask,
+ MEMCG_RECLAIM_MAY_SWAP);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2576,8 +2579,9 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
struct page_counter *counter;
unsigned long nr_reclaimed;
bool passed_oom = false;
- bool may_swap = true;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
bool drained = false;
+ bool raised_max_event = false;
unsigned long pflags;
retry:
@@ -2593,7 +2597,7 @@ retry:
mem_over_limit = mem_cgroup_from_counter(counter, memory);
} else {
mem_over_limit = mem_cgroup_from_counter(counter, memsw);
- may_swap = false;
+ reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
}
if (batch > nr_pages) {
@@ -2617,10 +2621,11 @@ retry:
goto nomem;
memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ raised_max_event = true;
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, may_swap);
+ gfp_mask, reclaim_options);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2684,6 +2689,13 @@ nomem:
return -ENOMEM;
force:
/*
+ * If the allocation has to be enforced, don't forget to raise
+ * a MEMCG_MAX event.
+ */
+ if (!raised_max_event)
+ memcg_memory_event(mem_over_limit, MEMCG_MAX);
+
+ /*
* The allocation either can't fail or will lead to more memory
* being freed very soon. Allow memory usage go over the limit
* temporarily by force charging it.
@@ -2842,27 +2854,9 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
return 0;
}
-/*
- * Returns a pointer to the memory cgroup to which the kernel object is charged.
- *
- * A passed kernel object can be a slab object or a generic kernel page, so
- * different mechanisms for getting the memory cgroup pointer should be used.
- * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
- * can not know for sure how the kernel object is implemented.
- * mem_cgroup_from_obj() can be safely used in such cases.
- *
- * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
- * cgroup_mutex, etc.
- */
-struct mem_cgroup *mem_cgroup_from_obj(void *p)
+static __always_inline
+struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
{
- struct folio *folio;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- folio = virt_to_folio(p);
-
/*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
@@ -2895,6 +2889,53 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
return page_memcg_check(folio_page(folio, 0));
}
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * A passed kernel object can be a slab object, vmalloc object or a generic
+ * kernel page, so different mechanisms for getting the memory cgroup pointer
+ * should be used.
+ *
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+ struct folio *folio;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ if (unlikely(is_vmalloc_addr(p)))
+ folio = page_folio(vmalloc_to_page(p));
+ else
+ folio = virt_to_folio(p);
+
+ return mem_cgroup_from_obj_folio(folio, p);
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
+ * allocated using vmalloc().
+ *
+ * A passed kernel object must be a slab object or a generic kernel page.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+}
+
static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg = NULL;
@@ -3402,8 +3443,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
continue;
}
- if (!try_to_free_mem_cgroup_pages(memcg, 1,
- GFP_KERNEL, !memsw)) {
+ if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
ret = -EBUSY;
break;
}
@@ -3513,7 +3554,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
if (signal_pending(current))
return -EINTR;
- if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
+ if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+ MEMCG_RECLAIM_MAY_SWAP))
nr_retries--;
}
@@ -3625,7 +3667,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg;
- if (cgroup_memory_nokmem)
+ if (mem_cgroup_kmem_disabled())
return 0;
if (unlikely(mem_cgroup_is_root(memcg)))
@@ -3649,7 +3691,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
struct mem_cgroup *parent;
- if (cgroup_memory_nokmem)
+ if (mem_cgroup_kmem_disabled())
return;
if (unlikely(mem_cgroup_is_root(memcg)))
@@ -4859,7 +4901,7 @@ static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
/*
* Deprecated.
- * Please, take a look at tools/cgroup/slabinfo.py .
+ * Please, take a look at tools/cgroup/memcg_slabinfo.py .
*/
return 0;
}
@@ -5060,6 +5102,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
return idr_find(&mem_cgroup_idr, id);
}
+#ifdef CONFIG_SHRINKER_DEBUG
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+ struct cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg;
+
+ cgrp = cgroup_get_from_id(ino);
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+ if (css)
+ memcg = container_of(css, struct mem_cgroup, css);
+ else
+ memcg = ERR_PTR(-ENOENT);
+
+ cgroup_put(cgrp);
+
+ return memcg;
+}
+#endif
+
static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -5665,8 +5730,8 @@ out:
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
+ * thus not on the lru.
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -5704,7 +5769,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
*/
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page))
+ if (is_device_private_page(page) ||
+ is_device_coherent_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -6241,7 +6307,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, true);
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
if (!reclaimed && !nr_retries--)
break;
@@ -6290,7 +6356,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, true))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
nr_reclaims--;
continue;
}
@@ -6335,11 +6401,11 @@ static int memory_events_local_show(struct seq_file *m, void *v)
static int memory_stat_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- char *buf;
+ char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
- buf = memory_stat_format(memcg);
if (!buf)
return -ENOMEM;
+ memory_stat_format(memcg, buf, PAGE_SIZE);
seq_puts(m, buf);
kfree(buf);
return 0;
@@ -6419,6 +6485,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
+ unsigned int reclaim_options;
int err;
buf = strstrip(buf);
@@ -6426,6 +6493,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
if (err)
return err;
+ reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed;
@@ -6442,7 +6510,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
- GFP_KERNEL, true);
+ GFP_KERNEL, reclaim_options);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b85661cbdc4a..9a7a228ad04a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -33,6 +33,9 @@
* are rare we hope to get away with this. This avoids impacting the core
* VM.
*/
+
+#define pr_fmt(fmt) "Memory failure: " fmt
+
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
@@ -69,6 +72,8 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
+static bool hw_memory_failure __read_mostly = false;
+
static bool __page_handle_poison(struct page *page)
{
int ret;
@@ -250,7 +255,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
short addr_lsb = tk->size_shift;
int ret = 0;
- pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
+ pr_err("%#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
if ((flags & MF_ACTION_REQUIRED) && (t == current))
@@ -268,7 +273,7 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
ret = send_sig_mceerr(BUS_MCEERR_AO, (void __user *)tk->addr,
addr_lsb, t); /* synchronous? */
if (ret < 0)
- pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
+ pr_info("Error sending signal to %s:%d: %d\n",
t->comm, t->pid, ret);
return ret;
}
@@ -295,10 +300,9 @@ void shake_page(struct page *p)
}
EXPORT_SYMBOL_GPL(shake_page);
-static unsigned long dev_pagemap_mapping_shift(struct page *page,
- struct vm_area_struct *vma)
+static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
+ unsigned long address)
{
- unsigned long address = vma_address(page, vma);
unsigned long ret = 0;
pgd_t *pgd;
p4d_t *p4d;
@@ -338,23 +342,33 @@ static unsigned long dev_pagemap_mapping_shift(struct page *page,
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
+ *
+ * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
+ * In other cases, such as anonymous and file-backend page, the address to be
+ * killed can be caculated by @p itself.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
- struct vm_area_struct *vma,
- struct list_head *to_kill)
+ pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
+ struct list_head *to_kill)
{
struct to_kill *tk;
tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
if (!tk) {
- pr_err("Memory failure: Out of memory while machine check handling\n");
+ pr_err("Out of memory while machine check handling\n");
return;
}
tk->addr = page_address_in_vma(p, vma);
- if (is_zone_device_page(p))
- tk->size_shift = dev_pagemap_mapping_shift(p, vma);
- else
+ if (is_zone_device_page(p)) {
+ /*
+ * Since page->mapping is not used for fsdax, we need
+ * calculate the address based on the vma.
+ */
+ if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+ tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
+ tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+ } else
tk->size_shift = page_shift(compound_head(p));
/*
@@ -368,7 +382,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* has a mapping for the page.
*/
if (tk->addr == -EFAULT) {
- pr_info("Memory failure: Unable to find user space address %lx in %s\n",
+ pr_info("Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
} else if (tk->size_shift == 0) {
kfree(tk);
@@ -401,7 +415,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* signal and then access the memory. Just kill it.
*/
if (fail || tk->addr == -EFAULT) {
- pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+ pr_err("%#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
tk->tsk, PIDTYPE_PID);
@@ -414,7 +428,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail,
* process anyways.
*/
else if (kill_proc(tk, pfn, flags) < 0)
- pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
+ pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
}
put_task_struct(tk->tsk);
@@ -503,7 +517,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -539,13 +553,41 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, vma, to_kill);
+ add_to_kill(t, page, 0, vma, to_kill);
}
}
read_unlock(&tasklist_lock);
i_mmap_unlock_read(mapping);
}
+#ifdef CONFIG_FS_DAX
+/*
+ * Collect processes when the error hit a fsdax page.
+ */
+static void collect_procs_fsdax(struct page *page,
+ struct address_space *mapping, pgoff_t pgoff,
+ struct list_head *to_kill)
+{
+ struct vm_area_struct *vma;
+ struct task_struct *tsk;
+
+ i_mmap_lock_read(mapping);
+ read_lock(&tasklist_lock);
+ for_each_process(tsk) {
+ struct task_struct *t = task_early_kill(tsk, true);
+
+ if (!t)
+ continue;
+ vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+ if (vma->vm_mm == t->mm)
+ add_to_kill(t, page, pgoff, vma, to_kill);
+ }
+ }
+ read_unlock(&tasklist_lock);
+ i_mmap_unlock_read(mapping);
+}
+#endif /* CONFIG_FS_DAX */
+
/*
* Collect the processes who have the corrupted page mapped to kill.
*/
@@ -777,12 +819,10 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
int err = mapping->a_ops->error_remove_page(mapping, p);
if (err != 0) {
- pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
- pfn, err);
+ pr_info("%#lx: Failed to punch page: %d\n", pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_info("Memory failure: %#lx: failed to release buffers\n",
- pfn);
+ pr_info("%#lx: failed to release buffers\n", pfn);
} else {
ret = MF_RECOVERED;
}
@@ -794,8 +834,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
if (invalidate_inode_page(p))
ret = MF_RECOVERED;
else
- pr_info("Memory failure: %#lx: Failed to invalidate\n",
- pfn);
+ pr_info("%#lx: Failed to invalidate\n", pfn);
}
return ret;
@@ -825,7 +864,7 @@ static bool has_extra_refcount(struct page_state *ps, struct page *p,
count -= 1;
if (count > 0) {
- pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+ pr_err("%#lx: %s still referenced by %d users\n",
page_to_pfn(p), action_page_types[ps->type], count);
return true;
}
@@ -849,7 +888,7 @@ static int me_kernel(struct page_state *ps, struct page *p)
*/
static int me_unknown(struct page_state *ps, struct page *p)
{
- pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
+ pr_err("%#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@@ -1005,12 +1044,13 @@ static int me_swapcache_dirty(struct page_state *ps, struct page *p)
static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
+ struct folio *folio = page_folio(p);
int ret;
- delete_from_swap_cache(p);
+ delete_from_swap_cache(folio);
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
- unlock_page(p);
+ folio_unlock(folio);
if (has_extra_refcount(ps, p, false))
ret = MF_FAILED;
@@ -1133,7 +1173,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
trace_memory_failure_event(pfn, type, result);
num_poisoned_pages_inc();
- pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
+ pr_err("%#lx: recovery action for %s: %s\n",
pfn, action_page_types[type], action_name[result]);
}
@@ -1208,8 +1248,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags)
if (head == compound_head(page))
return 1;
- pr_info("Memory failure: %#lx cannot catch tail\n",
- page_to_pfn(page));
+ pr_info("%#lx cannot catch tail\n", page_to_pfn(page));
put_page(head);
}
@@ -1272,7 +1311,7 @@ try_again:
}
out:
if (ret == -EIO)
- pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
+ pr_err("%#lx: unhandlable page.\n", page_to_pfn(p));
return ret;
}
@@ -1371,13 +1410,12 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
return true;
if (PageKsm(p)) {
- pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
+ pr_err("%#lx: can't handle KSM pages.\n", pfn);
return false;
}
if (PageSwapCache(p)) {
- pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
- pfn);
+ pr_err("%#lx: keeping poisoned page in swap cache\n", pfn);
ttu |= TTU_IGNORE_HWPOISON;
}
@@ -1395,7 +1433,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
- pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
+ pr_info("%#lx: corrupted page was clean: dropped without side effects\n",
pfn);
}
}
@@ -1424,14 +1462,14 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
i_mmap_unlock_write(mapping);
} else
- pr_info("Memory failure: %#lx: could not lock mapping for mapped huge page\n", pfn);
+ pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
} else {
try_to_unmap(folio, ttu);
}
unmap_success = !page_mapped(hpage);
if (!unmap_success)
- pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
+ pr_err("%#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
/*
@@ -1496,6 +1534,134 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
return 0;
}
+static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
+ struct address_space *mapping, pgoff_t index, int flags)
+{
+ struct to_kill *tk;
+ unsigned long size = 0;
+
+ list_for_each_entry(tk, to_kill, nd)
+ if (tk->size_shift)
+ size = max(size, 1UL << tk->size_shift);
+
+ if (size) {
+ /*
+ * Unmap the largest mapping to avoid breaking up device-dax
+ * mappings which are constant size. The actual size of the
+ * mapping being torn down is communicated in siginfo, see
+ * kill_proc()
+ */
+ loff_t start = (index << PAGE_SHIFT) & ~(size - 1);
+
+ unmap_mapping_range(mapping, start, size, 0);
+ }
+
+ kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags);
+}
+
+static int mf_generic_kill_procs(unsigned long long pfn, int flags,
+ struct dev_pagemap *pgmap)
+{
+ struct page *page = pfn_to_page(pfn);
+ LIST_HEAD(to_kill);
+ dax_entry_t cookie;
+ int rc = 0;
+
+ /*
+ * Pages instantiated by device-dax (not filesystem-dax)
+ * may be compound pages.
+ */
+ page = compound_head(page);
+
+ /*
+ * Prevent the inode from being freed while we are interrogating
+ * the address_space, typically this would be handled by
+ * lock_page(), but dax pages do not use the page lock. This
+ * also prevents changes to the mapping of this pfn until
+ * poison signaling is complete.
+ */
+ cookie = dax_lock_page(page);
+ if (!cookie)
+ return -EBUSY;
+
+ if (hwpoison_filter(page)) {
+ rc = -EOPNOTSUPP;
+ goto unlock;
+ }
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ case MEMORY_DEVICE_COHERENT:
+ /*
+ * TODO: Handle device pages which may need coordination
+ * with device-side memory.
+ */
+ rc = -ENXIO;
+ goto unlock;
+ default:
+ break;
+ }
+
+ /*
+ * Use this flag as an indication that the dax page has been
+ * remapped UC to prevent speculative consumption of poison.
+ */
+ SetPageHWPoison(page);
+
+ /*
+ * Unlike System-RAM there is no possibility to swap in a
+ * different physical page at a given virtual address, so all
+ * userspace consumption of ZONE_DEVICE memory necessitates
+ * SIGBUS (i.e. MF_MUST_KILL)
+ */
+ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+ collect_procs(page, &to_kill, true);
+
+ unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags);
+unlock:
+ dax_unlock_page(page, cookie);
+ return rc;
+}
+
+#ifdef CONFIG_FS_DAX
+/**
+ * mf_dax_kill_procs - Collect and kill processes who are using this file range
+ * @mapping: address_space of the file in use
+ * @index: start pgoff of the range within the file
+ * @count: length of the range, in unit of PAGE_SIZE
+ * @mf_flags: memory failure flags
+ */
+int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
+ unsigned long count, int mf_flags)
+{
+ LIST_HEAD(to_kill);
+ dax_entry_t cookie;
+ struct page *page;
+ size_t end = index + count;
+
+ mf_flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+
+ for (; index < end; index++) {
+ page = NULL;
+ cookie = dax_lock_mapping_entry(mapping, index, &page);
+ if (!cookie)
+ return -EBUSY;
+ if (!page)
+ goto unlock;
+
+ SetPageHWPoison(page);
+
+ collect_procs_fsdax(page, mapping, index, &to_kill);
+ unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
+ index, mf_flags);
+unlock:
+ dax_unlock_mapping_entry(mapping, index, cookie);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mf_dax_kill_procs);
+#endif /* CONFIG_FS_DAX */
+
/*
* Called from hugetlb code with hugetlb_lock held.
*
@@ -1564,7 +1730,7 @@ retry:
*hugetlb = 0;
return 0;
} else if (res == -EHWPOISON) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+ pr_err("%#lx: already hardware poisoned\n", pfn);
if (flags & MF_ACTION_REQUIRED) {
head = compound_head(p);
res = kill_accessing_process(current, page_to_pfn(head), flags);
@@ -1631,23 +1797,20 @@ out:
unlock_page(head);
return res;
}
+
#else
static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
{
return 0;
}
-#endif
+
+#endif /* CONFIG_HUGETLB_PAGE */
static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
struct dev_pagemap *pgmap)
{
struct page *page = pfn_to_page(pfn);
- unsigned long size = 0;
- struct to_kill *tk;
- LIST_HEAD(tokill);
- int rc = -EBUSY;
- loff_t start;
- dax_entry_t cookie;
+ int rc = -ENXIO;
if (flags & MF_COUNT_INCREASED)
/*
@@ -1656,73 +1819,24 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
put_page(page);
/* device metadata space is not recoverable */
- if (!pgmap_pfn_valid(pgmap, pfn)) {
- rc = -ENXIO;
+ if (!pgmap_pfn_valid(pgmap, pfn))
goto out;
- }
-
- /*
- * Pages instantiated by device-dax (not filesystem-dax)
- * may be compound pages.
- */
- page = compound_head(page);
/*
- * Prevent the inode from being freed while we are interrogating
- * the address_space, typically this would be handled by
- * lock_page(), but dax pages do not use the page lock. This
- * also prevents changes to the mapping of this pfn until
- * poison signaling is complete.
+ * Call driver's implementation to handle the memory failure, otherwise
+ * fall back to generic handler.
*/
- cookie = dax_lock_page(page);
- if (!cookie)
- goto out;
-
- if (hwpoison_filter(page)) {
- rc = -EOPNOTSUPP;
- goto unlock;
- }
-
- if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ if (pgmap->ops->memory_failure) {
+ rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
/*
- * TODO: Handle HMM pages which may need coordination
- * with device-side memory.
+ * Fall back to generic handler too if operation is not
+ * supported inside the driver/device/filesystem.
*/
- goto unlock;
+ if (rc != -EOPNOTSUPP)
+ goto out;
}
- /*
- * Use this flag as an indication that the dax page has been
- * remapped UC to prevent speculative consumption of poison.
- */
- SetPageHWPoison(page);
-
- /*
- * Unlike System-RAM there is no possibility to swap in a
- * different physical page at a given virtual address, so all
- * userspace consumption of ZONE_DEVICE memory necessitates
- * SIGBUS (i.e. MF_MUST_KILL)
- */
- flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
- collect_procs(page, &tokill, true);
-
- list_for_each_entry(tk, &tokill, nd)
- if (tk->size_shift)
- size = max(size, 1UL << tk->size_shift);
- if (size) {
- /*
- * Unmap the largest mapping to avoid breaking up
- * device-dax mappings which are constant size. The
- * actual size of the mapping being torn down is
- * communicated in siginfo, see kill_proc()
- */
- start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, size, 0);
- }
- kill_procs(&tokill, true, false, pfn, flags);
- rc = 0;
-unlock:
- dax_unlock_page(page, cookie);
+ rc = mf_generic_kill_procs(pfn, flags, pgmap);
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
@@ -1768,6 +1882,9 @@ int memory_failure(unsigned long pfn, int flags)
mutex_lock(&mf_mutex);
+ if (!(flags & MF_SW_SIMULATED))
+ hw_memory_failure = true;
+
p = pfn_to_online_page(pfn);
if (!p) {
res = arch_memory_failure(pfn, flags);
@@ -1782,8 +1899,7 @@ int memory_failure(unsigned long pfn, int flags)
goto unlock_mutex;
}
}
- pr_err("Memory failure: %#lx: memory outside kernel control\n",
- pfn);
+ pr_err("%#lx: memory outside kernel control\n", pfn);
res = -ENXIO;
goto unlock_mutex;
}
@@ -1794,8 +1910,7 @@ try_again:
goto unlock_mutex;
if (TestSetPageHWPoison(p)) {
- pr_err("Memory failure: %#lx: already hardware poisoned\n",
- pfn);
+ pr_err("%#lx: already hardware poisoned\n", pfn);
res = -EHWPOISON;
if (flags & MF_ACTION_REQUIRED)
res = kill_accessing_process(current, pfn, flags);
@@ -1935,7 +2050,7 @@ try_again:
/*
* Now take care of user space mappings.
- * Abort on fail: __delete_from_page_cache() assumes unmapped page.
+ * Abort on fail: __filemap_remove_folio() assumes unmapped page.
*/
if (!hwpoison_user_mappings(p, pfn, flags, p)) {
action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
@@ -2011,7 +2126,7 @@ void memory_failure_queue(unsigned long pfn, int flags)
if (kfifo_put(&mf_cpu->fifo, entry))
schedule_work_on(smp_processor_id(), &mf_cpu->work);
else
- pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
+ pr_err("buffer overflow when queuing memory failure at %#lx\n",
pfn);
spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
put_cpu_var(memory_failure_cpu);
@@ -2068,6 +2183,8 @@ static int __init memory_failure_init(void)
}
core_initcall(memory_failure_init);
+#undef pr_fmt
+#define pr_fmt(fmt) "" fmt
#define unpoison_pr_info(fmt, pfn, rs) \
({ \
if (__ratelimit(rs)) \
@@ -2103,6 +2220,13 @@ int unpoison_memory(unsigned long pfn)
mutex_lock(&mf_mutex);
+ if (hw_memory_failure) {
+ unpoison_pr_info("Unpoison: Disabled after HW memory failure %#lx\n",
+ pfn, &unpoison_rs);
+ ret = -EOPNOTSUPP;
+ goto unlock_mutex;
+ }
+
if (!PageHWPoison(p)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
@@ -2166,7 +2290,7 @@ static bool isolate_page(struct page *page, struct list_head *pagelist)
bool lru = PageLRU(page);
if (PageHuge(page)) {
- isolated = isolate_huge_page(page, pagelist);
+ isolated = !isolate_hugetlb(page, pagelist);
} else {
if (lru)
isolated = !isolate_lru_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 7a089145cad4..4ba73f5aa8bb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -624,6 +624,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (is_zero_pfn(pfn))
return NULL;
if (pte_devmap(pte))
+ /*
+ * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
+ * and will have refcounts incremented on their struct pages
+ * when they are inserted into PTEs, thus they are safe to
+ * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
+ * do not have refcounts. Example of legacy ZONE_DEVICE is
+ * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+ */
return NULL;
print_bad_pte(vma, addr, pte, NULL);
@@ -736,7 +744,7 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
* Currently device exclusive access only supports anonymous
* memory so the entry shouldn't point to a filebacked page.
*/
- WARN_ON_ONCE(!PageAnon(page));
+ WARN_ON_ONCE(1);
set_pte_at(vma->vm_mm, address, ptep, pte);
@@ -1245,7 +1253,7 @@ vma_needs_copy(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
if (userfaultfd_wp(dst_vma))
return true;
- if (src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP))
+ if (src_vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return true;
if (src_vma->anon_vma)
@@ -3020,7 +3028,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
balance_dirty_pages_ratelimited(mapping);
if (fpin) {
fput(fpin);
- return VM_FAULT_RETRY;
+ return VM_FAULT_COMPLETED;
}
}
@@ -3043,7 +3051,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
pte_t entry;
VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
- VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page));
+ VM_BUG_ON(page && PageAnon(page) && !PageAnonExclusive(page));
/*
* Clear the pages cpupid information as the existing
@@ -4369,9 +4377,12 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return VM_FAULT_OOM;
}
- /* See comment in handle_pte_fault() */
+ /*
+ * See comment in handle_pte_fault() for how this scenario happens, we
+ * need to return NOPAGE so that we drop this page.
+ */
if (pmd_devmap_trans_unstable(vmf->pmd))
- return 0;
+ return VM_FAULT_NOPAGE;
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
@@ -4431,10 +4442,6 @@ late_initcall(fault_around_debugfs);
* It uses vm_ops->map_pages() to map the pages, which skips the page if it's
* not ready to be mapped: not up-to-date, locked, etc.
*
- * This function is called with the page table lock taken. In the split ptlock
- * case the page table lock only protects only those entries which belong to
- * the page table corresponding to the fault address.
- *
* This function doesn't cross the VMA boundaries, in order to call map_pages()
* only once.
*
@@ -4693,7 +4700,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
pte = pte_modify(old_pte, vma->vm_page_prot);
page = vm_normal_page(vma, vmf->address, pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
goto out_map;
/* TODO: handle PTE-mapped THP */
@@ -4802,6 +4809,19 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
+ return VM_FAULT_FALLBACK;
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ return VM_FAULT_FALLBACK;
+}
+
+static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ /* No support for anonymous transparent PUD pages yet */
+ if (vma_is_anonymous(vmf->vma))
goto split;
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
@@ -4812,19 +4832,7 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf)
split:
/* COW or write-notify not handled on PUD level: split pud.*/
__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- return VM_FAULT_FALLBACK;
-}
-
-static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /* No support for anonymous transparent PUD pages yet */
- if (vma_is_anonymous(vmf->vma))
- return VM_FAULT_FALLBACK;
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
return VM_FAULT_FALLBACK;
}
@@ -4962,6 +4970,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
.gfp_mask = __get_fault_gfp_mask(vma),
};
struct mm_struct *mm = vma->vm_mm;
+ unsigned long vm_flags = vma->vm_flags;
pgd_t *pgd;
p4d_t *p4d;
vm_fault_t ret;
@@ -4975,7 +4984,8 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
if (!vmf.pud)
return VM_FAULT_OOM;
retry_pud:
- if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
+ if (pud_none(*vmf.pud) &&
+ hugepage_vma_check(vma, vm_flags, false, true)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -5008,7 +5018,8 @@ retry_pud:
if (pud_trans_unstable(vmf.pud))
goto retry_pud;
- if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
+ if (pmd_none(*vmf.pmd) &&
+ hugepage_vma_check(vma, vm_flags, false, true)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 1213d0c67a53..fad6d1f2262a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -43,30 +43,22 @@
#include "shuffle.h"
#ifdef CONFIG_MHP_MEMMAP_ON_MEMORY
-static int memmap_on_memory_set(const char *val, const struct kernel_param *kp)
-{
- if (hugetlb_optimize_vmemmap_enabled())
- return 0;
- return param_set_bool(val, kp);
-}
-
-static const struct kernel_param_ops memmap_on_memory_ops = {
- .flags = KERNEL_PARAM_OPS_FL_NOARG,
- .set = memmap_on_memory_set,
- .get = param_get_bool,
-};
-
/*
* memory_hotplug.memmap_on_memory parameter
*/
static bool memmap_on_memory __ro_after_init;
-module_param_cb(memmap_on_memory, &memmap_on_memory_ops, &memmap_on_memory, 0444);
+module_param(memmap_on_memory, bool, 0444);
MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug");
-bool mhp_memmap_on_memory(void)
+static inline bool mhp_memmap_on_memory(void)
{
return memmap_on_memory;
}
+#else
+static inline bool mhp_memmap_on_memory(void)
+{
+ return false;
+}
#endif
enum {
@@ -237,8 +229,7 @@ static void release_memory_resource(struct resource *res)
kfree(res);
}
-static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
- const char *reason)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages)
{
/*
* Disallow all operations smaller than a sub-section and only
@@ -255,12 +246,8 @@ static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
min_align = PAGES_PER_SUBSECTION;
else
min_align = PAGES_PER_SECTION;
- if (!IS_ALIGNED(pfn, min_align)
- || !IS_ALIGNED(nr_pages, min_align)) {
- WARN(1, "Misaligned __%s_pages start: %#lx end: #%lx\n",
- reason, pfn, pfn + nr_pages - 1);
+ if (!IS_ALIGNED(pfn | nr_pages, min_align))
return -EINVAL;
- }
return 0;
}
@@ -337,9 +324,10 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
altmap->alloc = 0;
}
- err = check_pfn_span(pfn, nr_pages, "add");
- if (err)
- return err;
+ if (check_pfn_span(pfn, nr_pages)) {
+ WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
+ return -EINVAL;
+ }
for (; pfn < end_pfn; pfn += cur_nr_pages) {
/* Select all remaining pages up to the next section boundary */
@@ -536,8 +524,10 @@ void __remove_pages(unsigned long pfn, unsigned long nr_pages,
map_offset = vmem_altmap_offset(altmap);
- if (check_pfn_span(pfn, nr_pages, "remove"))
+ if (check_pfn_span(pfn, nr_pages)) {
+ WARN(1, "Misaligned %s start: %#lx end: #%lx\n", __func__, pfn, pfn + nr_pages - 1);
return;
+ }
for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
@@ -672,12 +662,18 @@ static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned lon
}
+#ifdef CONFIG_ZONE_DEVICE
static void section_taint_zone_device(unsigned long pfn)
{
struct mem_section *ms = __pfn_to_section(pfn);
ms->section_mem_map |= SECTION_TAINT_ZONE_DEVICE;
}
+#else
+static inline void section_taint_zone_device(unsigned long pfn)
+{
+}
+#endif
/*
* Associate the pfn range with the given zone, initializing the memmaps
@@ -936,7 +932,7 @@ static struct zone *auto_movable_zone_for_pfn(int nid,
if (!page)
continue;
/* If anything is !MOVABLE online the rest !MOVABLE. */
- if (page_zonenum(page) != ZONE_MOVABLE)
+ if (!is_zone_movable_page(page))
goto kernel_zone;
online_pages += PAGES_PER_SECTION;
}
@@ -1031,7 +1027,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
struct zone *zone)
{
unsigned long end_pfn = pfn + nr_pages;
- int ret;
+ int ret, i;
ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
if (ret)
@@ -1039,6 +1035,9 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+ for (i = 0; i < nr_pages; i++)
+ SetPageVmemmapSelfHosted(pfn_to_page(pfn + i));
+
/*
* It might be that the vmemmap_pages fully span sections. If that is
* the case, mark those sections online here as otherwise they will be
@@ -1643,7 +1642,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (PageHuge(page)) {
pfn = page_to_pfn(head) + compound_nr(head) - 1;
- isolate_huge_page(head, &source);
+ isolate_hugetlb(head, &source);
continue;
} else if (PageTransHuge(page))
pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index d39b01fd52fe..b73d3248d976 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -465,9 +465,8 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
}
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
- spin_unlock(ptl);
walk->action = ACTION_CONTINUE;
- goto out;
+ goto unlock;
}
if (!queue_pages_required(page, qp))
goto unlock;
@@ -484,7 +483,6 @@ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
ret = -EIO;
unlock:
spin_unlock(ptl);
-out:
return ret;
}
@@ -523,7 +521,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
/*
* vm_normal_page() filters out zero pages, but there might
@@ -602,7 +600,7 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
- if (!isolate_huge_page(page, qp->pagelist) &&
+ if (isolate_hugetlb(page, qp->pagelist) &&
(flags & MPOL_MF_STRICT))
/*
* Failed to isolate page but allow migrating pages
@@ -1388,7 +1386,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
unsigned long bits = min_t(unsigned long, maxnode, BITS_PER_LONG);
unsigned long t;
- if (get_bitmap(&t, &nmask[maxnode / BITS_PER_LONG], bits))
+ if (get_bitmap(&t, &nmask[(maxnode - 1) / BITS_PER_LONG], bits))
return -EFAULT;
if (maxnode - bits >= MAX_NUMNODES) {
diff --git a/mm/mempool.c b/mm/mempool.c
index b933d0fc21b8..96488b13a1ef 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -379,7 +379,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
gfp_t gfp_temp;
VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO);
- might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+ might_alloc(gfp_mask);
gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */
gfp_mask |= __GFP_NORETRY; /* don't loop in __alloc_pages */
diff --git a/mm/memremap.c b/mm/memremap.c
index b870a659eee6..58b20c3c300b 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -141,10 +141,10 @@ void memunmap_pages(struct dev_pagemap *pgmap)
for (i = 0; i < pgmap->nr_range; i++)
percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i));
wait_for_completion(&pgmap->done);
- percpu_ref_exit(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
pageunmap_range(pgmap, i);
+ percpu_ref_exit(&pgmap->ref);
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
devmap_managed_enable_put(pgmap);
@@ -279,8 +279,8 @@ err_pfn_remap:
/*
- * Not device managed version of dev_memremap_pages, undone by
- * memunmap_pages(). Please use dev_memremap_pages if you have a struct
+ * Not device managed version of devm_memremap_pages, undone by
+ * memunmap_pages(). Please use devm_memremap_pages if you have a struct
* device available.
*/
void *memremap_pages(struct dev_pagemap *pgmap, int nid)
@@ -315,6 +315,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
return ERR_PTR(-EINVAL);
}
break;
+ case MEMORY_DEVICE_COHERENT:
+ if (!pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->owner) {
+ WARN(1, "Missing owner\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
case MEMORY_DEVICE_FS_DAX:
if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
WARN(1, "File system DAX not supported\n");
@@ -499,7 +509,7 @@ void free_zone_device_page(struct page *page)
}
#ifdef CONFIG_FS_DAX
-bool __put_devmap_managed_page(struct page *page)
+bool __put_devmap_managed_page_refs(struct page *page, int refs)
{
if (page->pgmap->type != MEMORY_DEVICE_FS_DAX)
return false;
@@ -509,9 +519,9 @@ bool __put_devmap_managed_page(struct page *page)
* refcount is 1, then the page is free and the refcount is
* stable because nobody holds a reference on the page.
*/
- if (page_ref_dec_return(page) == 1)
+ if (page_ref_sub_return(page, refs) == 1)
wake_up_var(&page->_refcount);
return true;
}
-EXPORT_SYMBOL(__put_devmap_managed_page);
+EXPORT_SYMBOL(__put_devmap_managed_page_refs);
#endif /* CONFIG_FS_DAX */
diff --git a/mm/migrate.c b/mm/migrate.c
index e51588e95f57..6a1597c92261 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -59,7 +59,7 @@
int isolate_movable_page(struct page *page, isolate_mode_t mode)
{
- struct address_space *mapping;
+ const struct movable_operations *mops;
/*
* Avoid burning cycles with pages that are yet under __free_pages(),
@@ -97,10 +97,10 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
if (!PageMovable(page) || PageIsolated(page))
goto out_no_isolated;
- mapping = page_mapping(page);
- VM_BUG_ON_PAGE(!mapping, page);
+ mops = page_movable_ops(page);
+ VM_BUG_ON_PAGE(!mops, page);
- if (!mapping->a_ops->isolate_page(page, mode))
+ if (!mops->isolate_page(page, mode))
goto out_no_isolated;
/* Driver shouldn't use PG_isolated bit of page->flags */
@@ -120,10 +120,9 @@ out:
static void putback_movable_page(struct page *page)
{
- struct address_space *mapping;
+ const struct movable_operations *mops = page_movable_ops(page);
- mapping = page_mapping(page);
- mapping->a_ops->putback_page(page);
+ mops->putback_page(page);
ClearPageIsolated(page);
}
@@ -133,7 +132,7 @@ static void putback_movable_page(struct page *page)
*
* This function shall be used whenever the isolated pageset has been
* built from lru, balloon, hugetlbfs page. See isolate_migratepages_range()
- * and isolate_huge_page().
+ * and isolate_hugetlb().
*/
void putback_movable_pages(struct list_head *l)
{
@@ -315,12 +314,27 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
__migration_entry_wait(mm, ptep, ptl);
}
-void migration_entry_wait_huge(struct vm_area_struct *vma,
- struct mm_struct *mm, pte_t *pte)
+#ifdef CONFIG_HUGETLB_PAGE
+void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+{
+ pte_t pte;
+
+ spin_lock(ptl);
+ pte = huge_ptep_get(ptep);
+
+ if (unlikely(!is_hugetlb_entry_migration(pte)))
+ spin_unlock(ptl);
+ else
+ migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
+}
+
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
{
- spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
- __migration_entry_wait(mm, pte, ptl);
+ spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
+
+ __migration_entry_wait_huge(pte, ptl);
}
+#endif
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
@@ -337,13 +351,18 @@ unlock:
}
#endif
-static int expected_page_refs(struct address_space *mapping, struct page *page)
+static int folio_expected_refs(struct address_space *mapping,
+ struct folio *folio)
{
- int expected_count = 1;
+ int refs = 1;
+ if (!mapping)
+ return refs;
+
+ refs += folio_nr_pages(folio);
+ if (folio_test_private(folio))
+ refs++;
- if (mapping)
- expected_count += compound_nr(page) + page_has_private(page);
- return expected_count;
+ return refs;
}
/*
@@ -360,7 +379,7 @@ int folio_migrate_mapping(struct address_space *mapping,
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
struct zone *oldzone, *newzone;
int dirty;
- int expected_count = expected_page_refs(mapping, &folio->page) + extra_count;
+ int expected_count = folio_expected_refs(mapping, folio) + extra_count;
long nr = folio_nr_pages(folio);
if (!mapping) {
@@ -470,26 +489,26 @@ EXPORT_SYMBOL(folio_migrate_mapping);
* of folio_migrate_mapping().
*/
int migrate_huge_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page)
+ struct folio *dst, struct folio *src)
{
- XA_STATE(xas, &mapping->i_pages, page_index(page));
+ XA_STATE(xas, &mapping->i_pages, folio_index(src));
int expected_count;
xas_lock_irq(&xas);
- expected_count = 2 + page_has_private(page);
- if (!page_ref_freeze(page, expected_count)) {
+ expected_count = 2 + folio_has_private(src);
+ if (!folio_ref_freeze(src, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
}
- newpage->index = page->index;
- newpage->mapping = page->mapping;
+ dst->index = src->index;
+ dst->mapping = src->mapping;
- get_page(newpage);
+ folio_get(dst);
- xas_store(&xas, newpage);
+ xas_store(&xas, dst);
- page_ref_unfreeze(page, expected_count - 1);
+ folio_ref_unfreeze(src, expected_count - 1);
xas_unlock_irq(&xas);
@@ -589,34 +608,37 @@ EXPORT_SYMBOL(folio_migrate_copy);
* Migration functions
***********************************************************/
-/*
- * Common logic to directly migrate a single LRU page suitable for
- * pages that do not use PagePrivate/PagePrivate2.
+/**
+ * migrate_folio() - Simple folio migration.
+ * @mapping: The address_space containing the folio.
+ * @dst: The folio to migrate the data to.
+ * @src: The folio containing the current data.
+ * @mode: How to migrate the page.
+ *
+ * Common logic to directly migrate a single LRU folio suitable for
+ * folios that do not use PagePrivate/PagePrivate2.
*
- * Pages are locked upon entry and exit.
+ * Folios are locked upon entry and exit.
*/
-int migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page,
- enum migrate_mode mode)
+int migrate_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
{
- struct folio *newfolio = page_folio(newpage);
- struct folio *folio = page_folio(page);
int rc;
- BUG_ON(folio_test_writeback(folio)); /* Writeback must be complete */
+ BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */
- rc = folio_migrate_mapping(mapping, newfolio, folio, 0);
+ rc = folio_migrate_mapping(mapping, dst, src, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
if (mode != MIGRATE_SYNC_NO_COPY)
- folio_migrate_copy(newfolio, folio);
+ folio_migrate_copy(dst, src);
else
- folio_migrate_flags(newfolio, folio);
+ folio_migrate_flags(dst, src);
return MIGRATEPAGE_SUCCESS;
}
-EXPORT_SYMBOL(migrate_page);
+EXPORT_SYMBOL(migrate_folio);
#ifdef CONFIG_BLOCK
/* Returns true if all buffers are successfully locked */
@@ -657,23 +679,23 @@ static bool buffer_migrate_lock_buffers(struct buffer_head *head,
return true;
}
-static int __buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode,
+static int __buffer_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode,
bool check_refs)
{
struct buffer_head *bh, *head;
int rc;
int expected_count;
- if (!page_has_buffers(page))
- return migrate_page(mapping, newpage, page, mode);
+ head = folio_buffers(src);
+ if (!head)
+ return migrate_folio(mapping, dst, src, mode);
/* Check whether page does not have extra refs before we do more work */
- expected_count = expected_page_refs(mapping, page);
- if (page_count(page) != expected_count)
+ expected_count = folio_expected_refs(mapping, src);
+ if (folio_ref_count(src) != expected_count)
return -EAGAIN;
- head = page_buffers(page);
if (!buffer_migrate_lock_buffers(head, mode))
return -EAGAIN;
@@ -704,23 +726,22 @@ recheck_buffers:
}
}
- rc = migrate_page_move_mapping(mapping, newpage, page, 0);
+ rc = folio_migrate_mapping(mapping, dst, src, 0);
if (rc != MIGRATEPAGE_SUCCESS)
goto unlock_buffers;
- attach_page_private(newpage, detach_page_private(page));
+ folio_attach_private(dst, folio_detach_private(src));
bh = head;
do {
- set_bh_page(bh, newpage, bh_offset(bh));
+ set_bh_page(bh, &dst->page, bh_offset(bh));
bh = bh->b_this_page;
-
} while (bh != head);
if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
+ folio_migrate_copy(dst, src);
else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(dst, src);
rc = MIGRATEPAGE_SUCCESS;
unlock_buffers:
@@ -730,43 +751,79 @@ unlock_buffers:
do {
unlock_buffer(bh);
bh = bh->b_this_page;
-
} while (bh != head);
return rc;
}
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist. For example attached buffer heads are accessed only under page lock.
+/**
+ * buffer_migrate_folio() - Migration function for folios with buffers.
+ * @mapping: The address space containing @src.
+ * @dst: The folio to migrate to.
+ * @src: The folio to migrate from.
+ * @mode: How to migrate the folio.
+ *
+ * This function can only be used if the underlying filesystem guarantees
+ * that no other references to @src exist. For example attached buffer
+ * heads are accessed only under the folio lock. If your filesystem cannot
+ * provide this guarantee, buffer_migrate_folio_norefs() may be more
+ * appropriate.
+ *
+ * Return: 0 on success or a negative errno on failure.
*/
-int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+int buffer_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- return __buffer_migrate_page(mapping, newpage, page, mode, false);
+ return __buffer_migrate_folio(mapping, dst, src, mode, false);
}
-EXPORT_SYMBOL(buffer_migrate_page);
+EXPORT_SYMBOL(buffer_migrate_folio);
-/*
- * Same as above except that this variant is more careful and checks that there
- * are also no buffer head references. This function is the right one for
- * mappings where buffer heads are directly looked up and referenced (such as
- * block device mappings).
+/**
+ * buffer_migrate_folio_norefs() - Migration function for folios with buffers.
+ * @mapping: The address space containing @src.
+ * @dst: The folio to migrate to.
+ * @src: The folio to migrate from.
+ * @mode: How to migrate the folio.
+ *
+ * Like buffer_migrate_folio() except that this variant is more careful
+ * and checks that there are also no buffer head references. This function
+ * is the right one for mappings where buffer heads are directly looked
+ * up and referenced (such as block device mappings).
+ *
+ * Return: 0 on success or a negative errno on failure.
*/
-int buffer_migrate_page_norefs(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+int buffer_migrate_folio_norefs(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- return __buffer_migrate_page(mapping, newpage, page, mode, true);
+ return __buffer_migrate_folio(mapping, dst, src, mode, true);
}
#endif
+int filemap_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
+{
+ int ret;
+
+ ret = folio_migrate_mapping(mapping, dst, src, 0);
+ if (ret != MIGRATEPAGE_SUCCESS)
+ return ret;
+
+ if (folio_get_private(src))
+ folio_attach_private(dst, folio_detach_private(src));
+
+ if (mode != MIGRATE_SYNC_NO_COPY)
+ folio_migrate_copy(dst, src);
+ else
+ folio_migrate_flags(dst, src);
+ return MIGRATEPAGE_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(filemap_migrate_folio);
+
/*
- * Writeback a page to clean the dirty state
+ * Writeback a folio to clean the dirty state
*/
-static int writeout(struct address_space *mapping, struct page *page)
+static int writeout(struct address_space *mapping, struct folio *folio)
{
- struct folio *folio = page_folio(page);
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = 1,
@@ -780,25 +837,25 @@ static int writeout(struct address_space *mapping, struct page *page)
/* No write method for the address space */
return -EINVAL;
- if (!clear_page_dirty_for_io(page))
+ if (!folio_clear_dirty_for_io(folio))
/* Someone else already triggered a write */
return -EAGAIN;
/*
- * A dirty page may imply that the underlying filesystem has
- * the page on some queue. So the page must be clean for
- * migration. Writeout may mean we loose the lock and the
- * page state is no longer what we checked for earlier.
+ * A dirty folio may imply that the underlying filesystem has
+ * the folio on some queue. So the folio must be clean for
+ * migration. Writeout may mean we lose the lock and the
+ * folio state is no longer what we checked for earlier.
* At this point we know that the migration attempt cannot
* be successful.
*/
remove_migration_ptes(folio, folio, false);
- rc = mapping->a_ops->writepage(page, &wbc);
+ rc = mapping->a_ops->writepage(&folio->page, &wbc);
if (rc != AOP_WRITEPAGE_ACTIVATE)
/* unlocked. Relock */
- lock_page(page);
+ folio_lock(folio);
return (rc < 0) ? -EIO : -EAGAIN;
}
@@ -806,11 +863,11 @@ static int writeout(struct address_space *mapping, struct page *page)
/*
* Default handling if a filesystem does not provide a migration function.
*/
-static int fallback_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+static int fallback_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
- if (PageDirty(page)) {
- /* Only writeback pages in full synchronous migration */
+ if (folio_test_dirty(src)) {
+ /* Only writeback folios in full synchronous migration */
switch (mode) {
case MIGRATE_SYNC:
case MIGRATE_SYNC_NO_COPY:
@@ -818,18 +875,18 @@ static int fallback_migrate_page(struct address_space *mapping,
default:
return -EBUSY;
}
- return writeout(mapping, page);
+ return writeout(mapping, src);
}
/*
* Buffers may be managed in a filesystem specific way.
* We must have no buffers or drop them.
*/
- if (page_has_private(page) &&
- !try_to_release_page(page, GFP_KERNEL))
+ if (folio_test_private(src) &&
+ !filemap_release_folio(src, GFP_KERNEL))
return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
- return migrate_page(mapping, newpage, page, mode);
+ return migrate_folio(mapping, dst, src, mode);
}
/*
@@ -846,32 +903,32 @@ static int fallback_migrate_page(struct address_space *mapping,
static int move_to_new_folio(struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
- struct address_space *mapping;
int rc = -EAGAIN;
bool is_lru = !__PageMovable(&src->page);
VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
- mapping = folio_mapping(src);
-
if (likely(is_lru)) {
+ struct address_space *mapping = folio_mapping(src);
+
if (!mapping)
- rc = migrate_page(mapping, &dst->page, &src->page, mode);
- else if (mapping->a_ops->migratepage)
+ rc = migrate_folio(mapping, dst, src, mode);
+ else if (mapping->a_ops->migrate_folio)
/*
- * Most pages have a mapping and most filesystems
- * provide a migratepage callback. Anonymous pages
+ * Most folios have a mapping and most filesystems
+ * provide a migrate_folio callback. Anonymous folios
* are part of swap space which also has its own
- * migratepage callback. This is the most common path
+ * migrate_folio callback. This is the most common path
* for page migration.
*/
- rc = mapping->a_ops->migratepage(mapping, &dst->page,
- &src->page, mode);
+ rc = mapping->a_ops->migrate_folio(mapping, dst, src,
+ mode);
else
- rc = fallback_migrate_page(mapping, &dst->page,
- &src->page, mode);
+ rc = fallback_migrate_folio(mapping, dst, src, mode);
} else {
+ const struct movable_operations *mops;
+
/*
* In case of non-lru page, it could be released after
* isolation step. In that case, we shouldn't try migration.
@@ -883,8 +940,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
goto out;
}
- rc = mapping->a_ops->migratepage(mapping, &dst->page,
- &src->page, mode);
+ mops = page_movable_ops(&src->page);
+ rc = mops->migrate_page(&dst->page, &src->page, mode);
WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
!folio_test_isolated(src));
}
@@ -1090,15 +1147,10 @@ static int unmap_and_move(new_page_t get_new_page,
return -ENOSYS;
if (page_count(page) == 1) {
- /* page was freed from under us. So we are done. */
+ /* Page was freed from under us. So we are done. */
ClearPageActive(page);
ClearPageUnevictable(page);
- if (unlikely(__PageMovable(page))) {
- lock_page(page);
- if (!PageMovable(page))
- ClearPageIsolated(page);
- unlock_page(page);
- }
+ /* free_pages_prepare() will clear PG_isolated. */
goto out;
}
@@ -1106,6 +1158,7 @@ static int unmap_and_move(new_page_t get_new_page,
if (!newpage)
return -ENOMEM;
+ newpage->private = 0;
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
@@ -1619,7 +1672,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
goto out;
err = -ENOENT;
- if (!page)
+ if (!page || is_zone_device_page(page))
goto out;
err = 0;
@@ -1632,8 +1685,9 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
if (PageHuge(page)) {
if (PageHead(page)) {
- isolate_huge_page(page, pagelist);
- err = 1;
+ err = isolate_hugetlb(page, pagelist);
+ if (!err)
+ err = 1;
}
} else {
struct page *head;
@@ -1809,7 +1863,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
if (IS_ERR(page))
goto set_status;
- if (page) {
+ if (page && !is_zone_device_page(page)) {
err = page_to_nid(page);
put_page(page);
} else {
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 5052093d0262..27fb37d65476 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -148,15 +148,21 @@ again:
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
- if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
- goto next;
pfn = pte_pfn(pte);
- if (is_zero_pfn(pfn)) {
+ if (is_zero_pfn(pfn) &&
+ (migrate->flags & MIGRATE_VMA_SELECT_SYSTEM)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+ if (page && !is_zone_device_page(page) &&
+ !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
+ else if (page && is_device_coherent_page(page) &&
+ (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+ page->pgmap->owner != migrate->pgmap_owner))
+ goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
@@ -518,7 +524,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
* handle_pte_fault()
* do_anonymous_page()
* to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
- * private page.
+ * private or coherent page.
*/
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
@@ -594,11 +600,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
page_to_pfn(page));
entry = swp_entry_to_pte(swp_entry);
} else {
- /*
- * For now we only support migrating to un-addressable device
- * memory.
- */
- if (is_zone_device_page(page)) {
+ if (is_zone_device_page(page) &&
+ !is_device_coherent_page(page)) {
pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
goto abort;
}
@@ -683,6 +686,12 @@ void migrate_vma_pages(struct migrate_vma *migrate)
}
if (!page) {
+ /*
+ * The only time there is no vma is when called from
+ * migrate_device_coherent_page(). However this isn't
+ * called if the page could not be unmapped.
+ */
+ VM_BUG_ON(!migrate->vma);
if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
if (!notified) {
@@ -701,10 +710,11 @@ void migrate_vma_pages(struct migrate_vma *migrate)
mapping = page_mapping(page);
- if (is_device_private_page(newpage)) {
+ if (is_device_private_page(newpage) ||
+ is_device_coherent_page(newpage)) {
/*
- * For now only support private anonymous when migrating
- * to un-addressable device memory.
+ * For now only support anonymous memory migrating to
+ * device private or coherent memory.
*/
if (mapping) {
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
@@ -718,7 +728,8 @@ void migrate_vma_pages(struct migrate_vma *migrate)
continue;
}
- r = migrate_page(mapping, newpage, page, MIGRATE_SYNC_NO_COPY);
+ r = migrate_folio(mapping, page_folio(newpage),
+ page_folio(page), MIGRATE_SYNC_NO_COPY);
if (r != MIGRATEPAGE_SUCCESS)
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
}
@@ -790,3 +801,49 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
}
}
EXPORT_SYMBOL(migrate_vma_finalize);
+
+/*
+ * Migrate a device coherent page back to normal memory. The caller should have
+ * a reference on page which will be copied to the new page if migration is
+ * successful or dropped on failure.
+ */
+int migrate_device_coherent_page(struct page *page)
+{
+ unsigned long src_pfn, dst_pfn = 0;
+ struct migrate_vma args;
+ struct page *dpage;
+
+ WARN_ON_ONCE(PageCompound(page));
+
+ lock_page(page);
+ src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
+ args.src = &src_pfn;
+ args.dst = &dst_pfn;
+ args.cpages = 1;
+ args.npages = 1;
+ args.vma = NULL;
+
+ /*
+ * We don't have a VMA and don't need to walk the page tables to find
+ * the source page. So call migrate_vma_unmap() directly to unmap the
+ * page as migrate_vma_setup() will fail if args.vma == NULL.
+ */
+ migrate_vma_unmap(&args);
+ if (!(src_pfn & MIGRATE_PFN_MIGRATE))
+ return -EBUSY;
+
+ dpage = alloc_page(GFP_USER | __GFP_NOWARN);
+ if (dpage) {
+ lock_page(dpage);
+ dst_pfn = migrate_pfn(page_to_pfn(dpage));
+ }
+
+ migrate_vma_pages(&args);
+ if (src_pfn & MIGRATE_PFN_MIGRATE)
+ copy_highpage(dpage, page);
+ migrate_vma_finalize(&args);
+
+ if (src_pfn & MIGRATE_PFN_MIGRATE)
+ return 0;
+ return -EBUSY;
+}
diff --git a/mm/mlock.c b/mm/mlock.c
index 716caf851043..b14e929084cc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
- if (!page)
+ if (!page || is_zone_device_page(page))
continue;
if (PageTransCompound(page))
continue;
diff --git a/mm/mmap.c b/mm/mmap.c
index 61e6135c54ef..c035020d0c89 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -81,53 +81,6 @@ static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
unsigned long start, unsigned long end);
-/* description of effects of mapping type and prot in current implementation.
- * this is due to the limited x86 page protection hardware. The expected
- * behavior is in parens:
- *
- * map_type prot
- * PROT_NONE PROT_READ PROT_WRITE PROT_EXEC
- * MAP_SHARED r: (no) no r: (yes) yes r: (no) yes r: (no) yes
- * w: (no) no w: (no) no w: (yes) yes w: (no) no
- * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
- * MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
- * w: (no) no w: (no) no w: (copy) copy w: (no) no
- * x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
- * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
- * MAP_PRIVATE (with Enhanced PAN supported):
- * r: (no) no
- * w: (no) no
- * x: (yes) yes
- */
-pgprot_t protection_map[16] __ro_after_init = {
- [VM_NONE] = __P000,
- [VM_READ] = __P001,
- [VM_WRITE] = __P010,
- [VM_WRITE | VM_READ] = __P011,
- [VM_EXEC] = __P100,
- [VM_EXEC | VM_READ] = __P101,
- [VM_EXEC | VM_WRITE] = __P110,
- [VM_EXEC | VM_WRITE | VM_READ] = __P111,
- [VM_SHARED] = __S000,
- [VM_SHARED | VM_READ] = __S001,
- [VM_SHARED | VM_WRITE] = __S010,
- [VM_SHARED | VM_WRITE | VM_READ] = __S011,
- [VM_SHARED | VM_EXEC] = __S100,
- [VM_SHARED | VM_EXEC | VM_READ] = __S101,
- [VM_SHARED | VM_EXEC | VM_WRITE] = __S110,
- [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = __S111
-};
-
-#ifndef CONFIG_ARCH_HAS_VM_GET_PAGE_PROT
-pgprot_t vm_get_page_prot(unsigned long vm_flags)
-{
- return protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
-}
-EXPORT_SYMBOL(vm_get_page_prot);
-#endif /* CONFIG_ARCH_HAS_VM_GET_PAGE_PROT */
-
static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
{
return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
@@ -1694,7 +1647,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
return 0;
/* Do we need to track softdirty? */
- if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
+ if (vma_soft_dirty_enabled(vma))
return 1;
/* Specialty mapping? */
@@ -1894,7 +1847,6 @@ unmap_and_free_vma:
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
free_vma:
@@ -2588,7 +2540,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
- /* don't alter vm_end if the coredump is running */
if (!prev || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
@@ -2944,7 +2895,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
unsigned long ret = -EINVAL;
struct file *file;
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.rst.\n",
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/mm/remap_file_pages.rst.\n",
current->comm, current->pid);
if (prot)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ba5592655ee3..3a23dde73723 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -38,6 +38,39 @@
#include "internal.h"
+static inline bool can_change_pte_writable(struct vm_area_struct *vma,
+ unsigned long addr, pte_t pte)
+{
+ struct page *page;
+
+ VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte));
+
+ if (pte_protnone(pte) || !pte_dirty(pte))
+ return false;
+
+ /* Do we need write faults for softdirty tracking? */
+ if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
+ return false;
+
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_pte_wp(vma, pte))
+ return false;
+
+ if (!(vma->vm_flags & VM_SHARED)) {
+ /*
+ * We can only special-case on exclusive anonymous pages,
+ * because we know that our write-fault handler similarly would
+ * map them writable without any additional checks while holding
+ * the PT lock.
+ */
+ page = vm_normal_page(vma, addr, pte);
+ if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+ return false;
+ }
+
+ return true;
+}
+
static unsigned long change_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -46,7 +79,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
unsigned long pages = 0;
int target_node = NUMA_NO_NODE;
- bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
@@ -95,7 +127,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
continue;
page = vm_normal_page(vma, addr, oldpte);
- if (!page || PageKsm(page))
+ if (!page || is_zone_device_page(page) || PageKsm(page))
continue;
/* Also skip shared copy-on-write pages */
@@ -137,21 +169,27 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
ptent = pte_wrprotect(ptent);
ptent = pte_mkuffd_wp(ptent);
} else if (uffd_wp_resolve) {
- /*
- * Leave the write bit to be handled
- * by PF interrupt handler, then
- * things like COW could be properly
- * handled.
- */
ptent = pte_clear_uffd_wp(ptent);
}
- /* Avoid taking write faults for known dirty pages */
- if (dirty_accountable && pte_dirty(ptent) &&
- (pte_soft_dirty(ptent) ||
- !(vma->vm_flags & VM_SOFTDIRTY))) {
+ /*
+ * In some writable, shared mappings, we might want
+ * to catch actual write access -- see
+ * vma_wants_writenotify().
+ *
+ * In all writable, private mappings, we have to
+ * properly handle COW.
+ *
+ * In both cases, we can sometimes still change PTEs
+ * writable and avoid the write-fault handler, for
+ * example, if a PTE is already dirty and no other
+ * COW or special handling is required.
+ */
+ if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+ !pte_write(ptent) &&
+ can_change_pte_writable(vma, addr, ptent))
ptent = pte_mkwrite(ptent);
- }
+
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
if (pte_needs_flush(oldpte, ptent))
tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
@@ -505,9 +543,9 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned long charged = 0;
+ bool try_change_writable;
pgoff_t pgoff;
int error;
- int dirty_accountable = 0;
if (newflags == oldflags) {
*pprev = vma;
@@ -583,11 +621,20 @@ success:
* held in write mode.
*/
vma->vm_flags = newflags;
- dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
+ /*
+ * We want to check manually if we can change individual PTEs writable
+ * if we can't do that automatically for all PTEs in a mapping. For
+ * private mappings, that's always the case when we have write
+ * permissions as we properly have to handle COW.
+ */
+ if (vma->vm_flags & VM_SHARED)
+ try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);
+ else
+ try_change_writable = !!(vma->vm_flags & VM_WRITE);
vma_set_page_prot(vma);
change_protection(tlb, vma, start, end, vma->vm_page_prot,
- dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+ try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0);
/*
* Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
@@ -616,7 +663,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
{
unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
- int error = -EINVAL;
+ int error;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
diff --git a/mm/nommu.c b/mm/nommu.c
index 9d7afc2d959e..e819cbc21b39 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -500,7 +500,7 @@ static void delete_nommu_region(struct vm_region *region)
static void free_page_series(unsigned long from, unsigned long to)
{
for (; from < to; from += PAGE_SIZE) {
- struct page *page = virt_to_page(from);
+ struct page *page = virt_to_page((void *)from);
atomic_long_dec(&mmap_pages_allocated);
put_page(page);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 55c2776ae699..d0d466a5c804 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
* If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout.
*/
-static void balance_dirty_pages(struct bdi_writeback *wb,
- unsigned long pages_dirtied)
+static int balance_dirty_pages(struct bdi_writeback *wb,
+ unsigned long pages_dirtied, unsigned int flags)
{
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
@@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
+ int ret = 0;
for (;;) {
unsigned long now = jiffies;
@@ -1628,6 +1629,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
}
/*
+ * In laptop mode, we wait until hitting the higher threshold
+ * before starting background writeout, and then write out all
+ * the way down to the lower threshold. So slow writers cause
+ * minimal disk activity.
+ *
+ * In normal mode, we start background writeout at the lower
+ * background_thresh, to keep the amount of dirty memory low.
+ */
+ if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
+ !writeback_in_progress(wb))
+ wb_start_background_writeback(wb);
+
+ /*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
* when the wb limits are ramping up in case of !strictlimit.
@@ -1657,6 +1671,7 @@ free_running:
break;
}
+ /* Start writeback even when in laptop mode */
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
@@ -1715,8 +1730,8 @@ free_running:
sdtc = mdtc;
}
- if (dirty_exceeded && !wb->dirty_exceeded)
- wb->dirty_exceeded = 1;
+ if (dirty_exceeded != wb->dirty_exceeded)
+ wb->dirty_exceeded = dirty_exceeded;
if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
BANDWIDTH_INTERVAL))
@@ -1789,6 +1804,10 @@ pause:
period,
pause,
start_time);
+ if (flags & BDP_ASYNC) {
+ ret = -EAGAIN;
+ break;
+ }
__set_current_state(TASK_KILLABLE);
wb->dirty_sleep = now;
io_schedule_timeout(pause);
@@ -1820,26 +1839,7 @@ pause:
if (fatal_signal_pending(current))
break;
}
-
- if (!dirty_exceeded && wb->dirty_exceeded)
- wb->dirty_exceeded = 0;
-
- if (writeback_in_progress(wb))
- return;
-
- /*
- * In laptop mode, we wait until hitting the higher threshold before
- * starting background writeout, and then write out all the way down
- * to the lower threshold. So slow writers cause minimal disk activity.
- *
- * In normal mode, we start background writeout at the lower
- * background_thresh, to keep the amount of dirty memory low.
- */
- if (laptop_mode)
- return;
-
- if (nr_reclaimable > gdtc->bg_thresh)
- wb_start_background_writeback(wb);
+ return ret;
}
static DEFINE_PER_CPU(int, bdp_ratelimits);
@@ -1861,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/**
- * balance_dirty_pages_ratelimited - balance dirty memory state
- * @mapping: address_space which was dirtied
+ * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ * @flags: BDP flags.
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
- * Once we're over the dirty memory limit we decrease the ratelimiting
- * by a lot, to prevent individual processes from overshooting the limit
- * by (ratelimit_pages) each.
+ * See balance_dirty_pages_ratelimited() for details.
+ *
+ * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
+ * indicate that memory is out of balance and the caller must wait
+ * for I/O to complete. Otherwise, it will return 0 to indicate
+ * that either memory was already in balance, or it was able to sleep
+ * until the amount of dirty memory returned to balance.
*/
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
+ unsigned int flags)
{
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL;
int ratelimit;
+ int ret = 0;
int *p;
if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
- return;
+ return ret;
if (inode_cgwb_enabled(inode))
wb = wb_get_create_current(bdi, GFP_KERNEL);
@@ -1921,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
preempt_enable();
if (unlikely(current->nr_dirtied >= ratelimit))
- balance_dirty_pages(wb, current->nr_dirtied);
+ ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
wb_put(wb);
+ return ret;
+}
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied. The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * Once we're over the dirty memory limit we decrease the ratelimiting
+ * by a lot, to prevent individual processes from overshooting the limit
+ * by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+ balance_dirty_pages_ratelimited_flags(mapping, 0);
}
EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e008a3df0485..e5486d47406e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -126,13 +126,97 @@ typedef int __bitwise fpi_t;
static DEFINE_MUTEX(pcp_batch_high_lock);
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
-struct pagesets {
- local_lock_t lock;
-};
-static DEFINE_PER_CPU(struct pagesets, pagesets) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+/*
+ * On SMP, spin_trylock is sufficient protection.
+ * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
+ */
+#define pcp_trylock_prepare(flags) do { } while (0)
+#define pcp_trylock_finish(flag) do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy. */
+#define pcp_trylock_prepare(flags) local_irq_save(flags)
+#define pcp_trylock_finish(flags) local_irq_restore(flags)
+#endif
+
+/*
+ * Locking a pcp requires a PCP lookup followed by a spinlock. To avoid
+ * a migration causing the wrong PCP to be locked and remote memory being
+ * potentially allocated, pin the task to the CPU for the lookup+lock.
+ * preempt_disable is used on !RT because it is faster than migrate_disable.
+ * migrate_disable is used on RT because otherwise RT spinlock usage is
+ * interfered with and a high priority task cannot preempt the allocator.
+ */
+#ifndef CONFIG_PREEMPT_RT
+#define pcpu_task_pin() preempt_disable()
+#define pcpu_task_unpin() preempt_enable()
+#else
+#define pcpu_task_pin() migrate_disable()
+#define pcpu_task_unpin() migrate_enable()
+#endif
+/*
+ * Generic helper to lookup and a per-cpu variable with an embedded spinlock.
+ * Return value should be used with equivalent unlock helper.
+ */
+#define pcpu_spin_lock(type, member, ptr) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ spin_lock(&_ret->member); \
+ _ret; \
+})
+
+#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ spin_lock_irqsave(&_ret->member, flags); \
+ _ret; \
+})
+
+#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \
+({ \
+ type *_ret; \
+ pcpu_task_pin(); \
+ _ret = this_cpu_ptr(ptr); \
+ if (!spin_trylock_irqsave(&_ret->member, flags)) { \
+ pcpu_task_unpin(); \
+ _ret = NULL; \
+ } \
+ _ret; \
+})
+
+#define pcpu_spin_unlock(member, ptr) \
+({ \
+ spin_unlock(&ptr->member); \
+ pcpu_task_unpin(); \
+})
+
+#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \
+({ \
+ spin_unlock_irqrestore(&ptr->member, flags); \
+ pcpu_task_unpin(); \
+})
+
+/* struct per_cpu_pages specific helpers. */
+#define pcp_spin_lock(ptr) \
+ pcpu_spin_lock(struct per_cpu_pages, lock, ptr)
+
+#define pcp_spin_lock_irqsave(ptr, flags) \
+ pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_trylock_irqsave(ptr, flags) \
+ pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags)
+
+#define pcp_spin_unlock(ptr) \
+ pcpu_spin_unlock(lock, ptr)
+
+#define pcp_spin_unlock_irqrestore(ptr, flags) \
+ pcpu_spin_unlock_irqrestore(lock, ptr, flags)
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -151,13 +235,7 @@ DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
-/* work_structs for global per-cpu drains */
-struct pcpu_drain {
- struct zone *zone;
- struct work_struct work;
-};
static DEFINE_MUTEX(pcpu_drain_mutex);
-static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
@@ -356,7 +434,7 @@ static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
-static bool mirrored_kernelcore __meminitdata;
+bool mirrored_kernelcore __initdata_memblock;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
@@ -524,7 +602,7 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
{
unsigned long *bitmap;
unsigned long bitidx, word_bitidx;
- unsigned long old_word, word;
+ unsigned long word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
@@ -540,12 +618,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
- for (;;) {
- old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
- if (word == old_word)
- break;
- word = old_word;
- }
+ do {
+ } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags));
}
void set_pageblock_migratetype(struct page *page, int migratetype)
@@ -653,7 +727,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
VM_BUG_ON(order != pageblock_order);
- base = PAGE_ALLOC_COSTLY_ORDER + 1;
+ return NR_LOWORDER_PCP_LISTS;
}
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -667,7 +741,7 @@ static inline int pindex_to_order(unsigned int pindex)
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (order > PAGE_ALLOC_COSTLY_ORDER)
+ if (pindex == NR_LOWORDER_PCP_LISTS)
order = pageblock_order;
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -744,6 +818,14 @@ void prep_compound_page(struct page *page, unsigned int order)
prep_compound_head(page, order);
}
+void destroy_large_folio(struct folio *folio)
+{
+ enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor;
+
+ VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio);
+ compound_page_dtors[dtor](&folio->page);
+}
+
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
@@ -785,7 +867,7 @@ static inline bool set_page_guard(struct zone *zone, struct page *page,
return false;
__SetPageGuard(page);
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&page->buddy_list);
set_page_private(page, order);
/* Guard pages are not available for any usage */
__mod_zone_freepage_state(zone, -(1 << order), migratetype);
@@ -928,7 +1010,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_add(&page->lru, &area->free_list[migratetype]);
+ list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
@@ -938,7 +1020,7 @@ static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_add_tail(&page->lru, &area->free_list[migratetype]);
+ list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
@@ -952,7 +1034,7 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
{
struct free_area *area = &zone->free_area[order];
- list_move_tail(&page->lru, &area->free_list[migratetype]);
+ list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
}
static inline void del_page_from_free_list(struct page *page, struct zone *zone,
@@ -962,7 +1044,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
if (page_reported(page))
__ClearPageReported(page);
- list_del(&page->lru);
+ list_del(&page->buddy_list);
__ClearPageBuddy(page);
set_page_private(page, 0);
zone->free_area[order].nr_free--;
@@ -1296,18 +1378,14 @@ static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags)
PageSkipKASanPoison(page);
}
-static void kernel_init_free_pages(struct page *page, int numpages)
+static void kernel_init_pages(struct page *page, int numpages)
{
int i;
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
- for (i = 0; i < numpages; i++) {
- u8 tag = page_kasan_tag(page + i);
- page_kasan_tag_reset(page + i);
- clear_highpage(page + i);
- page_kasan_tag_set(page + i, tag);
- }
+ for (i = 0; i < numpages; i++)
+ clear_highpage_kasan_tagged(page + i);
kasan_enable_current();
}
@@ -1396,7 +1474,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
init = false;
}
if (init)
- kernel_init_free_pages(page, 1 << order);
+ kernel_init_pages(page, 1 << order);
/*
* arch_free_page() can make the page's contents inaccessible. s390
@@ -1473,10 +1551,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* Ensure requested pindex is drained first. */
pindex = pindex - 1;
- /*
- * local_lock_irq held so equivalent to spin_lock_irqsave for
- * both PREEMPT_RT and non-PREEMPT_RT configurations.
- */
+ /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
spin_lock(&zone->lock);
isolated_pageblocks = has_isolate_pageblock(zone);
@@ -1504,11 +1579,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
do {
int mt;
- page = list_last_entry(list, struct page, lru);
+ page = list_last_entry(list, struct page, pcp_list);
mt = get_pcppage_migratetype(page);
/* must delete to avoid corrupting pcp list */
- list_del(&page->lru);
+ list_del(&page->pcp_list);
count -= nr_pages;
pcp->count -= nr_pages;
@@ -2361,7 +2436,7 @@ static inline bool check_new_pcp(struct page *page, unsigned int order)
}
#endif /* CONFIG_DEBUG_VM */
-static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
+static inline bool should_skip_kasan_unpoison(gfp_t flags)
{
/* Don't skip if a software KASAN mode is enabled. */
if (IS_ENABLED(CONFIG_KASAN_GENERIC) ||
@@ -2373,12 +2448,10 @@ static inline bool should_skip_kasan_unpoison(gfp_t flags, bool init_tags)
return true;
/*
- * With hardware tag-based KASAN enabled, skip if either:
- *
- * 1. Memory tags have already been cleared via tag_clear_highpage().
- * 2. Skipping has been requested via __GFP_SKIP_KASAN_UNPOISON.
+ * With hardware tag-based KASAN enabled, skip if this has been
+ * requested via __GFP_SKIP_KASAN_UNPOISON.
*/
- return init_tags || (flags & __GFP_SKIP_KASAN_UNPOISON);
+ return flags & __GFP_SKIP_KASAN_UNPOISON;
}
static inline bool should_skip_init(gfp_t flags)
@@ -2397,6 +2470,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) &&
!should_skip_init(gfp_flags);
bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS);
+ int i;
set_page_private(page, 0);
set_page_refcounted(page);
@@ -2422,8 +2496,6 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
* should be initialized as well).
*/
if (init_tags) {
- int i;
-
/* Initialize both memory and tags. */
for (i = 0; i != 1 << order; ++i)
tag_clear_highpage(page + i);
@@ -2431,17 +2503,21 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
/* Note that memory is already initialized by the loop above. */
init = false;
}
- if (!should_skip_kasan_unpoison(gfp_flags, init_tags)) {
+ if (!should_skip_kasan_unpoison(gfp_flags)) {
/* Unpoison shadow memory or set memory tags. */
kasan_unpoison_pages(page, order, init);
/* Note that memory is already initialized by KASAN. */
if (kasan_has_integrated_init())
init = false;
+ } else {
+ /* Ensure page_address() dereferencing does not fault. */
+ for (i = 0; i != 1 << order; ++i)
+ page_kasan_tag_reset(page + i);
}
/* If memory is still not initialized, do it now. */
if (init)
- kernel_init_free_pages(page, 1 << order);
+ kernel_init_pages(page, 1 << order);
/* Propagate __GFP_SKIP_KASAN_POISON to page flags. */
if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON))
SetPageSkipKASanPoison(page);
@@ -3044,10 +3120,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
{
int i, allocated = 0;
- /*
- * local_lock_irq held so equivalent to spin_lock_irqsave for
- * both PREEMPT_RT and non-PREEMPT_RT configurations.
- */
+ /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
struct page *page = __rmqueue(zone, order, migratetype,
@@ -3068,7 +3141,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* for IO devices that can merge IO requests if the physical
* pages are ordered properly.
*/
- list_add_tail(&page->lru, list);
+ list_add_tail(&page->pcp_list, list);
allocated++;
if (is_migrate_cma(get_pcppage_migratetype(page)))
__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3091,51 +3164,48 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* Called from the vmstat counter updater to drain pagesets of this
* currently executing processor on remote nodes after they have
* expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
*/
void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
{
- unsigned long flags;
int to_drain, batch;
- local_lock_irqsave(&pagesets.lock, flags);
batch = READ_ONCE(pcp->batch);
to_drain = min(pcp->count, batch);
- if (to_drain > 0)
+ if (to_drain > 0) {
+ unsigned long flags;
+
+ /*
+ * free_pcppages_bulk expects IRQs disabled for zone->lock
+ * so even though pcp->lock is not intended to be IRQ-safe,
+ * it's needed in this context.
+ */
+ spin_lock_irqsave(&pcp->lock, flags);
free_pcppages_bulk(zone, to_drain, pcp, 0);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ spin_unlock_irqrestore(&pcp->lock, flags);
+ }
}
#endif
/*
* Drain pcplists of the indicated processor and zone.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
*/
static void drain_pages_zone(unsigned int cpu, struct zone *zone)
{
- unsigned long flags;
struct per_cpu_pages *pcp;
- local_lock_irqsave(&pagesets.lock, flags);
-
pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu);
- if (pcp->count)
- free_pcppages_bulk(zone, pcp->count, pcp, 0);
+ if (pcp->count) {
+ unsigned long flags;
- local_unlock_irqrestore(&pagesets.lock, flags);
+ /* See drain_zone_pages on why this is disabling IRQs */
+ spin_lock_irqsave(&pcp->lock, flags);
+ free_pcppages_bulk(zone, pcp->count, pcp, 0);
+ spin_unlock_irqrestore(&pcp->lock, flags);
+ }
}
/*
* Drain pcplists of all zones on the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
*/
static void drain_pages(unsigned int cpu)
{
@@ -3148,9 +3218,6 @@ static void drain_pages(unsigned int cpu)
/*
* Spill all of this CPU's per-cpu pages back into the buddy allocator.
- *
- * The CPU has to be pinned. When zone parameter is non-NULL, spill just
- * the single zone's pages.
*/
void drain_local_pages(struct zone *zone)
{
@@ -3162,24 +3229,6 @@ void drain_local_pages(struct zone *zone)
drain_pages(cpu);
}
-static void drain_local_pages_wq(struct work_struct *work)
-{
- struct pcpu_drain *drain;
-
- drain = container_of(work, struct pcpu_drain, work);
-
- /*
- * drain_all_pages doesn't use proper cpu hotplug protection so
- * we can race with cpu offline when the WQ can move this from
- * a cpu pinned worker to an unbound one. We can operate on a different
- * cpu which is alright but we also have to make sure to not move to
- * a different one.
- */
- migrate_disable();
- drain_local_pages(drain->zone);
- migrate_enable();
-}
-
/*
* The implementation of drain_all_pages(), exposing an extra parameter to
* drain on all cpus.
@@ -3201,13 +3250,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
static cpumask_t cpus_with_pcps;
/*
- * Make sure nobody triggers this path before mm_percpu_wq is fully
- * initialized.
- */
- if (WARN_ON_ONCE(!mm_percpu_wq))
- return;
-
- /*
* Do not drain if one is already in progress unless it's specific to
* a zone. Such callers are primarily CMA and memory hotplug and need
* the drain to be complete when the call returns.
@@ -3256,14 +3298,11 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
}
for_each_cpu(cpu, &cpus_with_pcps) {
- struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
-
- drain->zone = zone;
- INIT_WORK(&drain->work, drain_local_pages_wq);
- queue_work_on(cpu, mm_percpu_wq, &drain->work);
+ if (zone)
+ drain_pages_zone(cpu, zone);
+ else
+ drain_pages(cpu);
}
- for_each_cpu(cpu, &cpus_with_pcps)
- flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
@@ -3272,8 +3311,6 @@ static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
* Spill all the per-cpu pages from all CPUs back into the buddy allocator.
*
* When zone parameter is non-NULL, spill just the single zone's pages.
- *
- * Note that this can be extremely slow as the draining happens in a workqueue.
*/
void drain_all_pages(struct zone *zone)
{
@@ -3318,7 +3355,7 @@ void mark_free_pages(struct zone *zone)
for_each_migratetype_order(order, t) {
list_for_each_entry(page,
- &zone->free_area[order].free_list[t], lru) {
+ &zone->free_area[order].free_list[t], buddy_list) {
unsigned long i;
pfn = page_to_pfn(page);
@@ -3395,19 +3432,17 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
return min(READ_ONCE(pcp->batch) << 2, high);
}
-static void free_unref_page_commit(struct page *page, int migratetype,
+static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
+ struct page *page, int migratetype,
unsigned int order)
{
- struct zone *zone = page_zone(page);
- struct per_cpu_pages *pcp;
int high;
int pindex;
bool free_high;
__count_vm_event(PGFREE);
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pindex = order_to_pindex(migratetype, order);
- list_add(&page->lru, &pcp->lists[pindex]);
+ list_add(&page->pcp_list, &pcp->lists[pindex]);
pcp->count += 1 << order;
/*
@@ -3432,6 +3467,9 @@ static void free_unref_page_commit(struct page *page, int migratetype,
void free_unref_page(struct page *page, unsigned int order)
{
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
+ struct per_cpu_pages *pcp;
+ struct zone *zone;
unsigned long pfn = page_to_pfn(page);
int migratetype;
@@ -3454,9 +3492,16 @@ void free_unref_page(struct page *page, unsigned int order)
migratetype = MIGRATE_MOVABLE;
}
- local_lock_irqsave(&pagesets.lock, flags);
- free_unref_page_commit(page, migratetype, order);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ zone = page_zone(page);
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (pcp) {
+ free_unref_page_commit(zone, pcp, page, migratetype, order);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ } else {
+ free_one_page(zone, page, pfn, order, migratetype, FPI_NONE);
+ }
+ pcp_trylock_finish(UP_flags);
}
/*
@@ -3465,6 +3510,8 @@ void free_unref_page(struct page *page, unsigned int order)
void free_unref_page_list(struct list_head *list)
{
struct page *page, *next;
+ struct per_cpu_pages *pcp = NULL;
+ struct zone *locked_zone = NULL;
unsigned long flags;
int batch_count = 0;
int migratetype;
@@ -3489,8 +3536,18 @@ void free_unref_page_list(struct list_head *list)
}
}
- local_lock_irqsave(&pagesets.lock, flags);
list_for_each_entry_safe(page, next, list, lru) {
+ struct zone *zone = page_zone(page);
+
+ /* Different zone, different pcp lock. */
+ if (zone != locked_zone) {
+ if (pcp)
+ pcp_spin_unlock_irqrestore(pcp, flags);
+
+ locked_zone = zone;
+ pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
+ }
+
/*
* Non-isolated types over MIGRATE_PCPTYPES get added
* to the MIGRATE_MOVABLE pcp list.
@@ -3500,19 +3557,21 @@ void free_unref_page_list(struct list_head *list)
migratetype = MIGRATE_MOVABLE;
trace_mm_page_free_batched(page);
- free_unref_page_commit(page, migratetype, 0);
+ free_unref_page_commit(zone, pcp, page, migratetype, 0);
/*
* Guard against excessive IRQ disabled times when we get
* a large list of pages to free.
*/
if (++batch_count == SWAP_CLUSTER_MAX) {
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
batch_count = 0;
- local_lock_irqsave(&pagesets.lock, flags);
+ pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags);
}
}
- local_unlock_irqrestore(&pagesets.lock, flags);
+
+ if (pcp)
+ pcp_spin_unlock_irqrestore(pcp, flags);
}
/*
@@ -3637,6 +3696,43 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
#endif
}
+static __always_inline
+struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+ unsigned int order, unsigned int alloc_flags,
+ int migratetype)
+{
+ struct page *page;
+ unsigned long flags;
+
+ do {
+ page = NULL;
+ spin_lock_irqsave(&zone->lock, flags);
+ /*
+ * order-0 request can reach here when the pcplist is skipped
+ * due to non-CMA allocation context. HIGHATOMIC area is
+ * reserved for high-order atomic allocation, so order-0
+ * request should skip it.
+ */
+ if (order > 0 && alloc_flags & ALLOC_HARDER)
+ page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+ if (!page) {
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
+ if (!page) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return NULL;
+ }
+ }
+ __mod_zone_freepage_state(zone, -(1 << order),
+ get_pcppage_migratetype(page));
+ spin_unlock_irqrestore(&zone->lock, flags);
+ } while (check_new_pages(page, order));
+
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ zone_statistics(preferred_zone, zone, 1);
+
+ return page;
+}
+
/* Remove page from the per-cpu list, caller must protect the list */
static inline
struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -3670,8 +3766,8 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
return NULL;
}
- page = list_first_entry(list, struct page, lru);
- list_del(&page->lru);
+ page = list_first_entry(list, struct page, pcp_list);
+ list_del(&page->pcp_list);
pcp->count -= 1 << order;
} while (check_new_pcp(page, order));
@@ -3688,19 +3784,29 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct list_head *list;
struct page *page;
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
- local_lock_irqsave(&pagesets.lock, flags);
+ /*
+ * spin_trylock may fail due to a parallel drain. In the future, the
+ * trylock will also protect against IRQ reentrancy.
+ */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (!pcp) {
+ pcp_trylock_finish(UP_flags);
+ return NULL;
+ }
/*
* On allocation, reduce the number of pages that are batch freed.
* See nr_pcp_free() where free_factor is increased for subsequent
* frees.
*/
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pcp->free_factor >>= 1;
list = &pcp->lists[order_to_pindex(migratetype, order)];
page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_trylock_finish(UP_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone, 1);
@@ -3717,9 +3823,14 @@ struct page *rmqueue(struct zone *preferred_zone,
gfp_t gfp_flags, unsigned int alloc_flags,
int migratetype)
{
- unsigned long flags;
struct page *page;
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+
if (likely(pcp_allowed_order(order))) {
/*
* MIGRATE_MOVABLE pcplist could have the pages on CMA area and
@@ -3729,53 +3840,23 @@ struct page *rmqueue(struct zone *preferred_zone,
migratetype != MIGRATE_MOVABLE) {
page = rmqueue_pcplist(preferred_zone, zone, order,
gfp_flags, migratetype, alloc_flags);
- goto out;
+ if (likely(page))
+ goto out;
}
}
- /*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
- do {
- page = NULL;
- spin_lock_irqsave(&zone->lock, flags);
- /*
- * order-0 request can reach here when the pcplist is skipped
- * due to non-CMA allocation context. HIGHATOMIC area is
- * reserved for high-order atomic allocation, so order-0
- * request should skip it.
- */
- if (order > 0 && alloc_flags & ALLOC_HARDER)
- page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
- if (!page) {
- page = __rmqueue(zone, order, migratetype, alloc_flags);
- if (!page)
- goto failed;
- }
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pcppage_migratetype(page));
- spin_unlock_irqrestore(&zone->lock, flags);
- } while (check_new_pages(page, order));
-
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
- zone_statistics(preferred_zone, zone, 1);
+ page = rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+ migratetype);
out:
/* Separate test+clear to avoid unnecessary atomics */
- if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
wakeup_kswapd(zone, 0, 0, zone_idx(zone));
}
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
-
-failed:
- spin_unlock_irqrestore(&zone->lock, flags);
- return NULL;
}
#ifdef CONFIG_FAIL_PAGE_ALLOC
@@ -3968,11 +4049,15 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
* need to be calculated.
*/
if (!order) {
- long fast_free;
+ long usable_free;
+ long reserved;
- fast_free = free_pages;
- fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
- if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+ usable_free = free_pages;
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
+
+ /* reserved may over estimate high-atomic reserves. */
+ usable_free -= min(usable_free, reserved);
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
return true;
}
@@ -4090,7 +4175,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
retry:
/*
* Scan zonelist, looking for a zone with enough free.
- * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
+ * See also __cpuset_node_allowed() comment in kernel/cgroup/cpuset.c.
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
@@ -5197,10 +5282,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
*alloc_flags |= ALLOC_CPUSET;
}
- fs_reclaim_acquire(gfp_mask);
- fs_reclaim_release(gfp_mask);
-
- might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
+ might_alloc(gfp_mask);
if (should_fail_alloc_page(gfp_mask, order))
return false;
@@ -5248,6 +5330,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
{
struct page *page;
unsigned long flags;
+ unsigned long __maybe_unused UP_flags;
struct zone *zone;
struct zoneref *z;
struct per_cpu_pages *pcp;
@@ -5328,11 +5411,14 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
if (unlikely(!zone))
goto failed;
+ /* Is a parallel drain in progress? */
+ pcp_trylock_prepare(UP_flags);
+ pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags);
+ if (!pcp)
+ goto failed_irq;
+
/* Attempt the batch allocation */
- local_lock_irqsave(&pagesets.lock, flags);
- pcp = this_cpu_ptr(zone->per_cpu_pageset);
pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
-
while (nr_populated < nr_pages) {
/* Skip existing pages */
@@ -5345,8 +5431,10 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
pcp, pcp_list);
if (unlikely(!page)) {
/* Try and allocate at least one page */
- if (!nr_account)
+ if (!nr_account) {
+ pcp_spin_unlock_irqrestore(pcp, flags);
goto failed_irq;
+ }
break;
}
nr_account++;
@@ -5359,7 +5447,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
nr_populated++;
}
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_spin_unlock_irqrestore(pcp, flags);
+ pcp_trylock_finish(UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
@@ -5368,7 +5457,7 @@ out:
return nr_populated;
failed_irq:
- local_unlock_irqrestore(&pagesets.lock, flags);
+ pcp_trylock_finish(UP_flags);
failed:
page = __alloc_pages(gfp, 0, preferred_nid, nodemask);
@@ -5799,14 +5888,14 @@ long si_mem_available(void)
/*
* Estimate the amount of memory available for userspace allocations,
- * without causing swapping.
+ * without causing swapping or OOM.
*/
available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
/*
* Not all the page cache can be freed, otherwise the system will
- * start swapping. Assume at least half of the page cache, or the
- * low watermark worth of cache, needs to stay.
+ * start swapping or thrashing. Assume at least half of the page
+ * cache, or the low watermark worth of cache, needs to stay.
*/
pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
pagecache -= min(pagecache / 2, wmark_low);
@@ -5934,7 +6023,7 @@ static void show_migration_types(unsigned char type)
void show_free_areas(unsigned int filter, nodemask_t *nodemask)
{
unsigned long free_pcp = 0;
- int cpu;
+ int cpu, nid;
struct zone *zone;
pg_data_t *pgdat;
@@ -6122,7 +6211,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
printk(KERN_CONT "= %lukB\n", K(total));
}
- hugetlb_show_meminfo();
+ for_each_online_node(nid) {
+ if (show_mem_node_skip(filter, nid, nodemask))
+ continue;
+ hugetlb_show_meminfo_node(nid);
+ }
printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
@@ -7008,6 +7101,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta
memset(pcp, 0, sizeof(*pcp));
memset(pzstats, 0, sizeof(*pzstats));
+ spin_lock_init(&pcp->lock);
for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
INIT_LIST_HEAD(&pcp->lists[pindex]);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index d200d41ad0d3..9d73dc38e3d7 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -286,6 +286,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* @flags: isolation flags
* @gfp_flags: GFP flags used for migrating pages
* @isolate_before: isolate the pageblock before the boundary_pfn
+ * @skip_isolation: the flag to skip the pageblock isolation in second
+ * isolate_single_pageblock()
*
* Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index c10f839fc410..8e9e574d535a 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -174,8 +174,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
if (!pvmw->pte)
return false;
- pvmw->ptl = huge_pte_lockptr(hstate, mm, pvmw->pte);
- spin_lock(pvmw->ptl);
+ pvmw->ptl = huge_pte_lock(hstate, mm, pvmw->pte);
if (!check_pte(pvmw))
return not_found(pvmw);
return true;
@@ -243,7 +242,7 @@ restart:
* cleared *pmd but not decremented compound_mapcount().
*/
if ((pvmw->flags & PVMW_SYNC) &&
- transparent_hugepage_active(vma) &&
+ transhuge_vma_suitable(vma, pvmw->address) &&
(pvmw->nr_pages >= HPAGE_PMD_NR)) {
spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
diff --git a/mm/percpu.c b/mm/percpu.c
index 3633eeefaa0d..27697b2429c2 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -3104,7 +3104,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
goto out_free_areas;
}
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(ptr);
+ kmemleak_ignore_phys(__pa(ptr));
areas[group] = ptr;
base = min(ptr, base);
@@ -3304,7 +3304,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t
goto enomem;
}
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(ptr);
+ kmemleak_ignore_phys(__pa(ptr));
pages[j++] = virt_to_page(ptr);
}
}
@@ -3417,7 +3417,7 @@ void __init setup_per_cpu_areas(void)
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
- kmemleak_free(fc);
+ kmemleak_ignore_phys(__pa(fc));
ai->dyn_size = unit_size;
ai->unit_size = unit_size;
diff --git a/mm/readahead.c b/mm/readahead.c
index 57a015108254..fdcd28cbd92d 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -510,6 +510,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
new_order--;
}
+ filemap_invalidate_lock_shared(mapping);
while (index <= limit) {
unsigned int order = new_order;
@@ -536,6 +537,7 @@ void page_cache_ra_order(struct readahead_control *ractl,
}
read_pages(ractl);
+ filemap_invalidate_unlock_shared(mapping);
/*
* If there were already pages in the page cache, then we may have
diff --git a/mm/rmap.c b/mm/rmap.c
index 5bcb334cd6f2..edc06c52bc82 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -999,7 +999,7 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
* downgrading page table protection not changing it to point
* to a new page.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
if (ret)
cleaned++;
@@ -1537,6 +1537,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
PageAnonExclusive(subpage);
if (folio_test_hugetlb(folio)) {
+ bool anon = folio_test_anon(folio);
+
/*
* The try_to_unmap() is only passed a hugetlb page
* in the case where the hugetlb page is poisoned.
@@ -1551,31 +1553,28 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
*/
flush_cache_range(vma, range.start, range.end);
- if (!folio_test_anon(folio)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
+ if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
/*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
*/
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
-
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
} else {
@@ -1619,9 +1618,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
- set_huge_swap_pte_at(mm, address,
- pvmw.pte, pteval,
- vma_mmu_pagesize(vma));
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
} else {
dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -1765,7 +1762,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
* to point at a new folio while a device is
* still using this folio.
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
dec_mm_counter(mm, mm_counter_file(&folio->page));
}
@@ -1775,7 +1772,7 @@ discard:
* done above for all cases requiring it to happen under page
* table lock before mmu_notifier_invalidate_range_end()
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
@@ -1899,13 +1896,30 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Unexpected PMD-mapped THP? */
VM_BUG_ON_FOLIO(!pvmw.pte, folio);
- subpage = folio_page(folio,
- pte_pfn(*pvmw.pte) - folio_pfn(folio));
+ if (folio_is_zone_device(folio)) {
+ /*
+ * Our PTE is a non-present device exclusive entry and
+ * calculating the subpage as for the common case would
+ * result in an invalid pointer.
+ *
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
+ */
+ VM_BUG_ON_FOLIO(folio_nr_pages(folio) > 1, folio);
+ subpage = &folio->page;
+ } else {
+ subpage = folio_page(folio,
+ pte_pfn(*pvmw.pte) - folio_pfn(folio));
+ }
address = pvmw.address;
anon_exclusive = folio_test_anon(folio) &&
PageAnonExclusive(subpage);
if (folio_test_hugetlb(folio)) {
+ bool anon = folio_test_anon(folio);
+
/*
* huge_pmd_unshare may unmap an entire PMD page.
* There is no way of knowing exactly which PMDs may
@@ -1915,31 +1929,28 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
*/
flush_cache_range(vma, range.start, range.end);
- if (!folio_test_anon(folio)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED));
+ if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) {
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
/*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
*/
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
-
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
-
- /*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
- */
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
+ page_vma_mapped_walk_done(&pvmw);
+ break;
}
/* Nuke the hugetlb page table entry */
@@ -1957,7 +1968,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/* Update high watermark before we lower rss */
update_hiwater_rss(mm);
- if (folio_is_zone_device(folio)) {
+ if (folio_is_device_private(folio)) {
unsigned long pfn = folio_pfn(folio);
swp_entry_t entry;
pte_t swp_pte;
@@ -1993,22 +2004,12 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
- *
- * The assignment to subpage above was computed from a
- * swap PTE which results in an invalid pointer.
- * Since only PAGE_SIZE pages can currently be
- * migrated, just set it to page. This will need to be
- * changed when hugepage migrations to device private
- * memory are supported.
*/
- subpage = &folio->page;
} else if (PageHWPoison(subpage)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (folio_test_hugetlb(folio)) {
hugetlb_count_sub(folio_nr_pages(folio), mm);
- set_huge_swap_pte_at(mm, address,
- pvmw.pte, pteval,
- vma_mmu_pagesize(vma));
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
} else {
dec_mm_counter(mm, mm_counter(&folio->page));
set_pte_at(mm, address, pvmw.pte, pteval);
@@ -2076,8 +2077,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
if (folio_test_hugetlb(folio))
- set_huge_swap_pte_at(mm, address, pvmw.pte,
- swp_pte, vma_mmu_pagesize(vma));
+ set_huge_pte_at(mm, address, pvmw.pte, swp_pte);
else
set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
@@ -2093,7 +2093,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
* done above for all cases requiring it to happen under page
* table lock before mmu_notifier_invalidate_range_end()
*
- * See Documentation/vm/mmu_notifier.rst
+ * See Documentation/mm/mmu_notifier.rst
*/
page_remove_rmap(subpage, vma, folio_test_hugetlb(folio));
if (vma->vm_flags & VM_LOCKED)
@@ -2131,7 +2131,8 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags)
TTU_SYNC)))
return;
- if (folio_is_zone_device(folio) && !folio_is_device_private(folio))
+ if (folio_is_zone_device(folio) &&
+ (!folio_is_device_private(folio) && !folio_is_device_coherent(folio)))
return;
/*
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 206ed6b40c1d..e3e9590c6fb3 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -55,22 +55,28 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf)
gfp_t gfp = vmf->gfp_mask;
unsigned long addr;
struct page *page;
+ vm_fault_t ret;
int err;
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return vmf_error(-EINVAL);
+ filemap_invalidate_lock_shared(mapping);
+
retry:
page = find_lock_page(mapping, offset);
if (!page) {
page = alloc_page(gfp | __GFP_ZERO);
- if (!page)
- return VM_FAULT_OOM;
+ if (!page) {
+ ret = VM_FAULT_OOM;
+ goto out;
+ }
err = set_direct_map_invalid_noflush(page);
if (err) {
put_page(page);
- return vmf_error(err);
+ ret = vmf_error(err);
+ goto out;
}
__SetPageUptodate(page);
@@ -86,7 +92,8 @@ retry:
if (err == -EEXIST)
goto retry;
- return vmf_error(err);
+ ret = vmf_error(err);
+ goto out;
}
addr = (unsigned long)page_address(page);
@@ -94,7 +101,11 @@ retry:
}
vmf->page = page;
- return VM_FAULT_LOCKED;
+ ret = VM_FAULT_LOCKED;
+
+out:
+ filemap_invalidate_unlock_shared(mapping);
+ return ret;
}
static const struct vm_operations_struct secretmem_vm_ops = {
@@ -133,14 +144,8 @@ static const struct file_operations secretmem_fops = {
.mmap = secretmem_mmap,
};
-static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode)
-{
- return false;
-}
-
-static int secretmem_migratepage(struct address_space *mapping,
- struct page *newpage, struct page *page,
- enum migrate_mode mode)
+static int secretmem_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src, enum migrate_mode mode)
{
return -EBUSY;
}
@@ -154,20 +159,27 @@ static void secretmem_free_folio(struct folio *folio)
const struct address_space_operations secretmem_aops = {
.dirty_folio = noop_dirty_folio,
.free_folio = secretmem_free_folio,
- .migratepage = secretmem_migratepage,
- .isolate_page = secretmem_isolate_page,
+ .migrate_folio = secretmem_migrate_folio,
};
static int secretmem_setattr(struct user_namespace *mnt_userns,
struct dentry *dentry, struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
+ struct address_space *mapping = inode->i_mapping;
unsigned int ia_valid = iattr->ia_valid;
+ int ret;
+
+ filemap_invalidate_lock(mapping);
if ((ia_valid & ATTR_SIZE) && inode->i_size)
- return -EINVAL;
+ ret = -EINVAL;
+ else
+ ret = simple_setattr(mnt_userns, dentry, iattr);
+
+ filemap_invalidate_unlock(mapping);
- return simple_setattr(mnt_userns, dentry, iattr);
+ return ret;
}
static const struct inode_operations secretmem_iops = {
@@ -180,11 +192,20 @@ static struct file *secretmem_file_create(unsigned long flags)
{
struct file *file = ERR_PTR(-ENOMEM);
struct inode *inode;
+ const char *anon_name = "[secretmem]";
+ const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
+ int err;
inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
if (IS_ERR(inode))
return ERR_CAST(inode);
+ err = security_inode_init_security_anon(inode, &qname, NULL);
+ if (err) {
+ file = ERR_PTR(err);
+ goto err_free_inode;
+ }
+
file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
O_RDWR, &secretmem_fops);
if (IS_ERR(file))
diff --git a/mm/shmem.c b/mm/shmem.c
index a6f565308133..5783f11351bb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
@@ -392,7 +393,7 @@ void shmem_uncharge(struct inode *inode, long pages)
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long flags;
- /* nrpages adjustment done by __delete_from_page_cache() or caller */
+ /* nrpages adjustment done by __filemap_remove_folio() or caller */
spin_lock_irqsave(&info->lock, flags);
info->alloced -= pages;
@@ -693,7 +694,7 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * Like add_to_page_cache_locked, but error if expected item has gone.
+ * Like filemap_add_folio, but error if expected item has gone.
*/
static int shmem_add_to_page_cache(struct folio *folio,
struct address_space *mapping,
@@ -867,18 +868,17 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
*/
void shmem_unlock_mapping(struct address_space *mapping)
{
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t index = 0;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
/*
* Minor point, but we might as well stop if someone else SHM_LOCKs it.
*/
- while (!mapping_unevictable(mapping)) {
- if (!pagevec_lookup(&pvec, mapping, &index))
- break;
- check_move_unevictable_pages(&pvec);
- pagevec_release(&pvec);
+ while (!mapping_unevictable(mapping) &&
+ filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
+ check_move_unevictable_folios(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
}
}
@@ -1058,6 +1058,15 @@ static int shmem_getattr(struct user_namespace *mnt_userns,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
}
+ if (info->fsflags & FS_APPEND_FL)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (info->fsflags & FS_IMMUTABLE_FL)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (info->fsflags & FS_NODUMP_FL)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
generic_fillattr(&init_user_ns, inode, stat);
if (shmem_is_huge(NULL, inode, 0))
@@ -1691,7 +1700,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
return;
folio_wait_writeback(folio);
- delete_from_swap_cache(&folio->page);
+ delete_from_swap_cache(folio);
spin_lock_irq(&info->lock);
/*
* Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
@@ -1706,10 +1715,10 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
}
/*
- * Swap in the page pointed to by *pagep.
- * Caller has to make sure that *pagep contains a valid swapped page.
- * Returns 0 and the page in pagep if success. On failure, returns the
- * error code and NULL in *pagep.
+ * Swap in the folio pointed to by *foliop.
+ * Caller has to make sure that *foliop contains a valid swapped folio.
+ * Returns 0 and the folio in foliop if success. On failure, returns the
+ * error code and NULL in *foliop.
*/
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
@@ -1749,7 +1758,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
}
folio = page_folio(page);
- /* We have to do this with page locked to prevent races */
+ /* We have to do this with folio locked to prevent races */
folio_lock(folio);
if (!folio_test_swapcache(folio) ||
folio_swap_entry(folio).val != swap.val ||
@@ -1789,7 +1798,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
- delete_from_swap_cache(&folio->page);
+ delete_from_swap_cache(folio);
folio_mark_dirty(folio);
swap_free(swap);
@@ -2272,7 +2281,18 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
return 0;
}
-static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
+/* Mask out flags that are inappropriate for the given type of inode. */
+static unsigned shmem_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & SHMEM_REG_FLMASK;
+ else
+ return flags & SHMEM_OTHER_FLMASK;
+}
+
+static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
umode_t mode, dev_t dev, unsigned long flags)
{
struct inode *inode;
@@ -2297,6 +2317,9 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
info->i_crtime = inode->i_mtime;
+ info->fsflags = (dir == NULL) ? 0 :
+ SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
+ info->fsflags = shmem_mask_flags(mode, info->fsflags);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
@@ -2603,7 +2626,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
ret = copy_page_to_iter(page, offset, nr, to);
put_page(page);
- } else if (iter_is_iovec(to)) {
+ } else if (user_backed_iter(to)) {
/*
* Copy to user tends to be so well optimized, but
* clear_user() not so much, that it is noticeably
@@ -3138,6 +3161,40 @@ static const char *shmem_get_link(struct dentry *dentry,
}
#ifdef CONFIG_TMPFS_XATTR
+
+static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+
+ fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
+
+ return 0;
+}
+
+static int shmem_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
+
+ info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
+ (fa->flags & SHMEM_FL_USER_MODIFIABLE);
+
+ inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME);
+ if (info->fsflags & FS_APPEND_FL)
+ inode->i_flags |= S_APPEND;
+ if (info->fsflags & FS_IMMUTABLE_FL)
+ inode->i_flags |= S_IMMUTABLE;
+ if (info->fsflags & FS_NOATIME_FL)
+ inode->i_flags |= S_NOATIME;
+
+ inode->i_ctime = current_time(inode);
+ return 0;
+}
+
/*
* Superblocks without xattr inode operations may get some security.* xattr
* support from the LSM "for free". As soon as we have any other xattrs
@@ -3392,7 +3449,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_nr_blocks:
ctx->blocks = memparse(param->string, &rest);
- if (*rest)
+ if (*rest || ctx->blocks > S64_MAX)
goto bad_value;
ctx->seen |= SHMEM_SEEN_BLOCKS;
break;
@@ -3514,10 +3571,7 @@ static int shmem_reconfigure(struct fs_context *fc)
raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (ctx->blocks > S64_MAX) {
- err = "Number of blocks too large";
- goto out;
- }
+
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
err = "Cannot retroactively limit size";
@@ -3802,7 +3856,7 @@ const struct address_space_operations shmem_aops = {
.write_end = shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
- .migratepage = migrate_page,
+ .migrate_folio = migrate_folio,
#endif
.error_remove_page = shmem_error_remove_page,
};
@@ -3828,6 +3882,8 @@ static const struct inode_operations shmem_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
.set_acl = simple_set_acl,
+ .fileattr_get = shmem_fileattr_get,
+ .fileattr_set = shmem_fileattr_set,
#endif
};
@@ -3847,6 +3903,8 @@ static const struct inode_operations shmem_dir_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
+ .fileattr_get = shmem_fileattr_get,
+ .fileattr_set = shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
new file mode 100644
index 000000000000..b05295bab322
--- /dev/null
+++ b/mm/shrinker_debug.c
@@ -0,0 +1,286 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/shrinker.h>
+#include <linux/memcontrol.h>
+
+/* defined in vmscan.c */
+extern struct rw_semaphore shrinker_rwsem;
+extern struct list_head shrinker_list;
+
+static DEFINE_IDA(shrinker_debugfs_ida);
+static struct dentry *shrinker_debugfs_root;
+
+static unsigned long shrinker_count_objects(struct shrinker *shrinker,
+ struct mem_cgroup *memcg,
+ unsigned long *count_per_node)
+{
+ unsigned long nr, total = 0;
+ int nid;
+
+ for_each_node(nid) {
+ if (nid == 0 || (shrinker->flags & SHRINKER_NUMA_AWARE)) {
+ struct shrink_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .nid = nid,
+ .memcg = memcg,
+ };
+
+ nr = shrinker->count_objects(shrinker, &sc);
+ if (nr == SHRINK_EMPTY)
+ nr = 0;
+ } else {
+ nr = 0;
+ }
+
+ count_per_node[nid] = nr;
+ total += nr;
+ }
+
+ return total;
+}
+
+static int shrinker_debugfs_count_show(struct seq_file *m, void *v)
+{
+ struct shrinker *shrinker = m->private;
+ unsigned long *count_per_node;
+ struct mem_cgroup *memcg;
+ unsigned long total;
+ bool memcg_aware;
+ int ret, nid;
+
+ count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
+ if (!count_per_node)
+ return -ENOMEM;
+
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ kfree(count_per_node);
+ return ret;
+ }
+ rcu_read_lock();
+
+ memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE;
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ if (memcg && !mem_cgroup_online(memcg))
+ continue;
+
+ total = shrinker_count_objects(shrinker,
+ memcg_aware ? memcg : NULL,
+ count_per_node);
+ if (total) {
+ seq_printf(m, "%lu", mem_cgroup_ino(memcg));
+ for_each_node(nid)
+ seq_printf(m, " %lu", count_per_node[nid]);
+ seq_putc(m, '\n');
+ }
+
+ if (!memcg_aware) {
+ mem_cgroup_iter_break(NULL, memcg);
+ break;
+ }
+
+ if (signal_pending(current)) {
+ mem_cgroup_iter_break(NULL, memcg);
+ ret = -EINTR;
+ break;
+ }
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+
+ rcu_read_unlock();
+ up_read(&shrinker_rwsem);
+
+ kfree(count_per_node);
+ return ret;
+}
+DEFINE_SHOW_ATTRIBUTE(shrinker_debugfs_count);
+
+static int shrinker_debugfs_scan_open(struct inode *inode, struct file *file)
+{
+ file->private_data = inode->i_private;
+ return nonseekable_open(inode, file);
+}
+
+static ssize_t shrinker_debugfs_scan_write(struct file *file,
+ const char __user *buf,
+ size_t size, loff_t *pos)
+{
+ struct shrinker *shrinker = file->private_data;
+ unsigned long nr_to_scan = 0, ino, read_len;
+ struct shrink_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ };
+ struct mem_cgroup *memcg = NULL;
+ int nid;
+ char kbuf[72];
+ ssize_t ret;
+
+ read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
+ if (copy_from_user(kbuf, buf, read_len))
+ return -EFAULT;
+ kbuf[read_len] = '\0';
+
+ if (sscanf(kbuf, "%lu %d %lu", &ino, &nid, &nr_to_scan) != 3)
+ return -EINVAL;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return -EINVAL;
+
+ if (nr_to_scan == 0)
+ return size;
+
+ if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+ memcg = mem_cgroup_get_from_ino(ino);
+ if (!memcg || IS_ERR(memcg))
+ return -ENOENT;
+
+ if (!mem_cgroup_online(memcg)) {
+ mem_cgroup_put(memcg);
+ return -ENOENT;
+ }
+ } else if (ino != 0) {
+ return -EINVAL;
+ }
+
+ ret = down_read_killable(&shrinker_rwsem);
+ if (ret) {
+ mem_cgroup_put(memcg);
+ return ret;
+ }
+
+ sc.nid = nid;
+ sc.memcg = memcg;
+ sc.nr_to_scan = nr_to_scan;
+ sc.nr_scanned = nr_to_scan;
+
+ shrinker->scan_objects(shrinker, &sc);
+
+ up_read(&shrinker_rwsem);
+ mem_cgroup_put(memcg);
+
+ return size;
+}
+
+static const struct file_operations shrinker_debugfs_scan_fops = {
+ .owner = THIS_MODULE,
+ .open = shrinker_debugfs_scan_open,
+ .write = shrinker_debugfs_scan_write,
+};
+
+int shrinker_debugfs_add(struct shrinker *shrinker)
+{
+ struct dentry *entry;
+ char buf[128];
+ int id;
+
+ lockdep_assert_held(&shrinker_rwsem);
+
+ /* debugfs isn't initialized yet, add debugfs entries later. */
+ if (!shrinker_debugfs_root)
+ return 0;
+
+ id = ida_alloc(&shrinker_debugfs_ida, GFP_KERNEL);
+ if (id < 0)
+ return id;
+ shrinker->debugfs_id = id;
+
+ snprintf(buf, sizeof(buf), "%s-%d", shrinker->name, id);
+
+ /* create debugfs entry */
+ entry = debugfs_create_dir(buf, shrinker_debugfs_root);
+ if (IS_ERR(entry)) {
+ ida_free(&shrinker_debugfs_ida, id);
+ return PTR_ERR(entry);
+ }
+ shrinker->debugfs_entry = entry;
+
+ debugfs_create_file("count", 0220, entry, shrinker,
+ &shrinker_debugfs_count_fops);
+ debugfs_create_file("scan", 0440, entry, shrinker,
+ &shrinker_debugfs_scan_fops);
+ return 0;
+}
+
+int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...)
+{
+ struct dentry *entry;
+ char buf[128];
+ const char *new, *old;
+ va_list ap;
+ int ret = 0;
+
+ va_start(ap, fmt);
+ new = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+
+ if (!new)
+ return -ENOMEM;
+
+ down_write(&shrinker_rwsem);
+
+ old = shrinker->name;
+ shrinker->name = new;
+
+ if (shrinker->debugfs_entry) {
+ snprintf(buf, sizeof(buf), "%s-%d", shrinker->name,
+ shrinker->debugfs_id);
+
+ entry = debugfs_rename(shrinker_debugfs_root,
+ shrinker->debugfs_entry,
+ shrinker_debugfs_root, buf);
+ if (IS_ERR(entry))
+ ret = PTR_ERR(entry);
+ else
+ shrinker->debugfs_entry = entry;
+ }
+
+ up_write(&shrinker_rwsem);
+
+ kfree_const(old);
+
+ return ret;
+}
+EXPORT_SYMBOL(shrinker_debugfs_rename);
+
+void shrinker_debugfs_remove(struct shrinker *shrinker)
+{
+ lockdep_assert_held(&shrinker_rwsem);
+
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+
+ if (!shrinker->debugfs_entry)
+ return;
+
+ debugfs_remove_recursive(shrinker->debugfs_entry);
+ ida_free(&shrinker_debugfs_ida, shrinker->debugfs_id);
+}
+
+static int __init shrinker_debugfs_init(void)
+{
+ struct shrinker *shrinker;
+ struct dentry *dentry;
+ int ret = 0;
+
+ dentry = debugfs_create_dir("shrinker", NULL);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+ shrinker_debugfs_root = dentry;
+
+ /* Create debugfs entries for shrinkers registered at boot */
+ down_write(&shrinker_rwsem);
+ list_for_each_entry(shrinker, &shrinker_list, list)
+ if (!shrinker->debugfs_entry) {
+ ret = shrinker_debugfs_add(shrinker);
+ if (ret)
+ break;
+ }
+ up_write(&shrinker_rwsem);
+
+ return ret;
+}
+late_initcall(shrinker_debugfs_init);
diff --git a/mm/slab.c b/mm/slab.c
index f8cd00f4ba13..10e96137b44f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2958,12 +2958,6 @@ direct_grow:
return ac->entry[--ac->avail];
}
-static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
- gfp_t flags)
-{
- might_sleep_if(gfpflags_allow_blocking(flags));
-}
-
#if DEBUG
static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
gfp_t flags, void *objp, unsigned long caller)
@@ -3205,7 +3199,6 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
if (unlikely(ptr))
goto out_hooks;
- cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
if (nodeid == NUMA_NO_NODE)
@@ -3230,7 +3223,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_
}
/* ___cache_alloc_node can fall back to other nodes */
ptr = ____cache_alloc_node(cachep, flags, nodeid);
- out:
+out:
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
init = slab_want_init_on_alloc(flags, cachep);
@@ -3259,7 +3252,7 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
if (!objp)
objp = ____cache_alloc_node(cache, flags, numa_mem_id());
- out:
+out:
return objp;
}
#else
@@ -3290,7 +3283,6 @@ slab_alloc(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags,
if (unlikely(objp))
goto out;
- cache_alloc_debugcheck_before(cachep, flags);
local_irq_save(save_flags);
objp = __do_cache_alloc(cachep, flags);
local_irq_restore(save_flags);
@@ -3406,9 +3398,10 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
{
bool init;
+ memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1);
+
if (is_kfence_address(objp)) {
kmemleak_free_recursive(objp, cachep->flags);
- memcg_slab_free_hook(cachep, &objp, 1);
__kfence_free(objp);
return;
}
@@ -3441,7 +3434,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
check_irq_off();
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
- memcg_slab_free_hook(cachep, &objp, 1);
/*
* Skip calling cache_free_alien() when the platform is not numa.
@@ -3478,7 +3470,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru,
{
void *ret = slab_alloc(cachep, lru, flags, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc(_RET_IP_, ret,
+ trace_kmem_cache_alloc(_RET_IP_, ret, cachep,
cachep->object_size, cachep->size, flags);
return ret;
@@ -3527,8 +3519,6 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
if (!s)
return 0;
- cache_alloc_debugcheck_before(s, flags);
-
local_irq_disable();
for (i = 0; i < size; i++) {
void *objp = kfence_alloc(s, s->object_size, flags) ?: __do_cache_alloc(s, flags);
@@ -3553,7 +3543,7 @@ error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
slab_post_alloc_hook(s, objcg, flags, i, p, false);
- __kmem_cache_free_bulk(s, i, p);
+ kmem_cache_free_bulk(s, i, p);
return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -3567,7 +3557,7 @@ kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
ret = slab_alloc(cachep, NULL, flags, size, _RET_IP_);
ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(_RET_IP_, ret,
+ trace_kmalloc(_RET_IP_, ret, cachep,
size, cachep->size, flags);
return ret;
}
@@ -3592,7 +3582,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, cachep->object_size, _RET_IP_);
- trace_kmem_cache_alloc_node(_RET_IP_, ret,
+ trace_kmem_cache_alloc_node(_RET_IP_, ret, cachep,
cachep->object_size, cachep->size,
flags, nodeid);
@@ -3611,7 +3601,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
ret = slab_alloc_node(cachep, flags, nodeid, size, _RET_IP_);
ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(_RET_IP_, ret, cachep,
size, cachep->size,
flags, nodeid);
return ret;
@@ -3694,7 +3684,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
ret = slab_alloc(cachep, NULL, flags, size, caller);
ret = kasan_kmalloc(cachep, ret, size, flags);
- trace_kmalloc(caller, ret,
+ trace_kmalloc(caller, ret, cachep,
size, cachep->size, flags);
return ret;
diff --git a/mm/slab.h b/mm/slab.h
index db9fb5c8dae7..4ec82bec15ec 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -380,15 +380,6 @@ void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s);
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
-/*
- * Generic implementation of bulk operations
- * These are useful for situations in which the allocator cannot
- * perform optimizations. In that case segments of the object listed
- * may be allocated or freed using these operations.
- */
-void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
-int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-
static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
{
return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -547,36 +538,22 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
obj_cgroup_put(objcg);
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
- struct kmem_cache *s;
struct obj_cgroup **objcgs;
- struct obj_cgroup *objcg;
- struct slab *slab;
- unsigned int off;
int i;
if (!memcg_kmem_enabled())
return;
- for (i = 0; i < objects; i++) {
- if (unlikely(!p[i]))
- continue;
-
- slab = virt_to_slab(p[i]);
- /* we could be given a kmalloc_large() object, skip those */
- if (!slab)
- continue;
-
- objcgs = slab_objcgs(slab);
- if (!objcgs)
- continue;
+ objcgs = slab_objcgs(slab);
+ if (!objcgs)
+ return;
- if (!s_orig)
- s = slab->slab_cache;
- else
- s = s_orig;
+ for (i = 0; i < objects; i++) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
off = obj_to_index(s, slab, p[i]);
objcg = objcgs[off];
@@ -628,7 +605,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
{
}
-static inline void memcg_slab_free_hook(struct kmem_cache *s,
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void **p, int objects)
{
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 77c3adf40e50..17996649cfe3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,13 +26,12 @@
#include <linux/memcontrol.h>
#include <linux/stackdepot.h>
-#define CREATE_TRACE_POINTS
-#include <trace/events/kmem.h>
-
#include "internal.h"
-
#include "slab.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
enum slab_state slab_state;
LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
@@ -105,33 +104,6 @@ static inline int kmem_cache_sanity_check(const char *name, unsigned int size)
}
#endif
-void __kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
-{
- size_t i;
-
- for (i = 0; i < nr; i++) {
- if (s)
- kmem_cache_free(s, p[i]);
- else
- kfree(p[i]);
- }
-}
-
-int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
- void **p)
-{
- size_t i;
-
- for (i = 0; i < nr; i++) {
- void *x = p[i] = kmem_cache_alloc(s, flags);
- if (!x) {
- __kmem_cache_free_bulk(s, i, p);
- return 0;
- }
- }
- return i;
-}
-
/*
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
@@ -959,7 +931,7 @@ EXPORT_SYMBOL(kmalloc_order);
void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
{
void *ret = kmalloc_order(size, flags, order);
- trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+ trace_kmalloc(_RET_IP_, ret, NULL, size, PAGE_SIZE << order, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order_trace);
diff --git a/mm/slob.c b/mm/slob.c
index f47811f09aca..2bd4f476c340 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -507,7 +507,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
*m = size;
ret = (void *)m + minalign;
- trace_kmalloc_node(caller, ret,
+ trace_kmalloc_node(caller, ret, NULL,
size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -516,7 +516,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
gfp |= __GFP_COMP;
ret = slob_new_pages(gfp, order, node);
- trace_kmalloc_node(caller, ret,
+ trace_kmalloc_node(caller, ret, NULL,
size, PAGE_SIZE << order, gfp, node);
}
@@ -616,12 +616,12 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node, 0);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
} else {
b = slob_new_pages(flags, get_order(c->size), node);
- trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
+ trace_kmem_cache_alloc_node(_RET_IP_, b, NULL, c->object_size,
PAGE_SIZE << get_order(c->size),
flags, node);
}
@@ -692,16 +692,33 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
}
EXPORT_SYMBOL(kmem_cache_free);
-void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
+void kmem_cache_free_bulk(struct kmem_cache *s, size_t nr, void **p)
{
- __kmem_cache_free_bulk(s, size, p);
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ if (s)
+ kmem_cache_free(s, p[i]);
+ else
+ kfree(p[i]);
+ }
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
+int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
void **p)
{
- return __kmem_cache_alloc_bulk(s, flags, size, p);
+ size_t i;
+
+ for (i = 0; i < nr; i++) {
+ void *x = p[i] = kmem_cache_alloc(s, flags);
+
+ if (!x) {
+ kmem_cache_free_bulk(s, i, p);
+ return 0;
+ }
+ }
+ return i;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
diff --git a/mm/slub.c b/mm/slub.c
index e5535020e0fd..862dbd9af4f5 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -726,25 +726,48 @@ static struct track *get_track(struct kmem_cache *s, void *object,
return kasan_reset_tag(p + alloc);
}
-static void noinline set_track(struct kmem_cache *s, void *object,
- enum track_item alloc, unsigned long addr)
-{
- struct track *p = get_track(s, object, alloc);
-
#ifdef CONFIG_STACKDEPOT
+static noinline depot_stack_handle_t set_track_prepare(void)
+{
+ depot_stack_handle_t handle;
unsigned long entries[TRACK_ADDRS_COUNT];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
- p->handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+ handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
+
+ return handle;
+}
+#else
+static inline depot_stack_handle_t set_track_prepare(void)
+{
+ return 0;
+}
#endif
+static void set_track_update(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr,
+ depot_stack_handle_t handle)
+{
+ struct track *p = get_track(s, object, alloc);
+
+#ifdef CONFIG_STACKDEPOT
+ p->handle = handle;
+#endif
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
}
+static __always_inline void set_track(struct kmem_cache *s, void *object,
+ enum track_item alloc, unsigned long addr)
+{
+ depot_stack_handle_t handle = set_track_prepare();
+
+ set_track_update(s, object, alloc, addr, handle);
+}
+
static void init_tracking(struct kmem_cache *s, void *object)
{
struct track *p;
@@ -1373,6 +1396,10 @@ static noinline int free_debug_processing(
int cnt = 0;
unsigned long flags, flags2;
int ret = 0;
+ depot_stack_handle_t handle = 0;
+
+ if (s->flags & SLAB_STORE_USER)
+ handle = set_track_prepare();
spin_lock_irqsave(&n->list_lock, flags);
slab_lock(slab, &flags2);
@@ -1391,7 +1418,7 @@ next_object:
}
if (s->flags & SLAB_STORE_USER)
- set_track(s, object, TRACK_FREE, addr);
+ set_track_update(s, object, TRACK_FREE, addr, handle);
trace(s, slab, object, 0);
/* Freepointer not overwritten by init_object(), SLAB_POISON moved it */
init_object(s, object, SLUB_RED_INACTIVE);
@@ -2936,6 +2963,7 @@ redo:
if (!freelist) {
c->slab = NULL;
+ c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
@@ -2968,6 +2996,7 @@ deactivate_slab:
freelist = c->freelist;
c->slab = NULL;
c->freelist = NULL;
+ c->tid = next_tid(c->tid);
local_unlock_irqrestore(&s->cpu_slab->lock, flags);
deactivate_slab(s, slab, freelist);
@@ -3228,7 +3257,7 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
{
void *ret = slab_alloc(s, lru, gfpflags, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size,
+ trace_kmem_cache_alloc(_RET_IP_, ret, s, s->object_size,
s->size, gfpflags);
return ret;
@@ -3251,7 +3280,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_lru);
void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc(s, NULL, gfpflags, _RET_IP_, size);
- trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ trace_kmalloc(_RET_IP_, ret, s, size, s->size, gfpflags);
ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
@@ -3263,7 +3292,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size);
- trace_kmem_cache_alloc_node(_RET_IP_, ret,
+ trace_kmem_cache_alloc_node(_RET_IP_, ret, s,
s->object_size, s->size, gfpflags, node);
return ret;
@@ -3277,7 +3306,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
{
void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size);
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(_RET_IP_, ret, s,
size, s->size, gfpflags, node);
ret = kasan_kmalloc(s, ret, size, gfpflags);
@@ -3435,9 +3464,6 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long tid;
- /* memcg_slab_free_hook() is already called for bulk free. */
- if (!tail)
- memcg_slab_free_hook(s, &head, 1);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3497,9 +3523,10 @@ redo:
}
static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab,
- void *head, void *tail, int cnt,
+ void *head, void *tail, void **p, int cnt,
unsigned long addr)
{
+ memcg_slab_free_hook(s, slab, p, cnt);
/*
* With KASAN enabled slab_free_freelist_hook modifies the freelist
* to remove objects, whose reuse must be delayed.
@@ -3521,7 +3548,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
if (!s)
return;
trace_kmem_cache_free(_RET_IP_, x, s->name);
- slab_free(s, virt_to_slab(x), x, NULL, 1, _RET_IP_);
+ slab_free(s, virt_to_slab(x), x, NULL, &x, 1, _RET_IP_);
}
EXPORT_SYMBOL(kmem_cache_free);
@@ -3562,88 +3589,67 @@ static inline
int build_detached_freelist(struct kmem_cache *s, size_t size,
void **p, struct detached_freelist *df)
{
- size_t first_skipped_index = 0;
int lookahead = 3;
void *object;
struct folio *folio;
- struct slab *slab;
-
- /* Always re-init detached_freelist */
- df->slab = NULL;
-
- do {
- object = p[--size];
- /* Do we need !ZERO_OR_NULL_PTR(object) here? (for kfree) */
- } while (!object && size);
-
- if (!object)
- return 0;
+ size_t same;
+ object = p[--size];
folio = virt_to_folio(object);
if (!s) {
/* Handle kalloc'ed objects */
if (unlikely(!folio_test_slab(folio))) {
free_large_kmalloc(folio, object);
- p[size] = NULL; /* mark object processed */
+ df->slab = NULL;
return size;
}
/* Derive kmem_cache from object */
- slab = folio_slab(folio);
- df->s = slab->slab_cache;
+ df->slab = folio_slab(folio);
+ df->s = df->slab->slab_cache;
} else {
- slab = folio_slab(folio);
+ df->slab = folio_slab(folio);
df->s = cache_from_obj(s, object); /* Support for memcg */
}
- if (is_kfence_address(object)) {
- slab_free_hook(df->s, object, false);
- __kfence_free(object);
- p[size] = NULL; /* mark object processed */
- return size;
- }
-
/* Start new detached freelist */
- df->slab = slab;
- set_freepointer(df->s, object, NULL);
df->tail = object;
df->freelist = object;
- p[size] = NULL; /* mark object processed */
df->cnt = 1;
+ if (is_kfence_address(object))
+ return size;
+
+ set_freepointer(df->s, object, NULL);
+
+ same = size;
while (size) {
object = p[--size];
- if (!object)
- continue; /* Skip processed objects */
-
/* df->slab is always set at this point */
if (df->slab == virt_to_slab(object)) {
/* Opportunity build freelist */
set_freepointer(df->s, object, df->freelist);
df->freelist = object;
df->cnt++;
- p[size] = NULL; /* mark object processed */
-
+ same--;
+ if (size != same)
+ swap(p[size], p[same]);
continue;
}
/* Limit look ahead search */
if (!--lookahead)
break;
-
- if (!first_skipped_index)
- first_skipped_index = size + 1;
}
- return first_skipped_index;
+ return same;
}
/* Note that interrupts must be enabled when calling this function. */
void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
{
- if (WARN_ON(!size))
+ if (!size)
return;
- memcg_slab_free_hook(s, p, size);
do {
struct detached_freelist df;
@@ -3651,7 +3657,8 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
if (!df.slab)
continue;
- slab_free(df.s, df.slab, df.freelist, df.tail, df.cnt, _RET_IP_);
+ slab_free(df.s, df.slab, df.freelist, df.tail, &p[size], df.cnt,
+ _RET_IP_);
} while (likely(size));
}
EXPORT_SYMBOL(kmem_cache_free_bulk);
@@ -3731,7 +3738,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
error:
slub_put_cpu_ptr(s->cpu_slab);
slab_post_alloc_hook(s, objcg, flags, i, p, false);
- __kmem_cache_free_bulk(s, i, p);
+ kmem_cache_free_bulk(s, i, p);
return 0;
}
EXPORT_SYMBOL(kmem_cache_alloc_bulk);
@@ -4412,7 +4419,7 @@ void *__kmalloc(size_t size, gfp_t flags)
ret = slab_alloc(s, NULL, flags, _RET_IP_, size);
- trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
+ trace_kmalloc(_RET_IP_, ret, s, size, s->size, flags);
ret = kasan_kmalloc(s, ret, size, flags);
@@ -4446,7 +4453,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
ret = kmalloc_large_node(size, flags, node);
- trace_kmalloc_node(_RET_IP_, ret,
+ trace_kmalloc_node(_RET_IP_, ret, NULL,
size, PAGE_SIZE << get_order(size),
flags, node);
@@ -4460,7 +4467,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
ret = slab_alloc_node(s, NULL, flags, node, _RET_IP_, size);
- trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
+ trace_kmalloc_node(_RET_IP_, ret, s, size, s->size, flags, node);
ret = kasan_kmalloc(s, ret, size, flags);
@@ -4552,7 +4559,7 @@ void kfree(const void *x)
return;
}
slab = folio_slab(folio);
- slab_free(slab->slab_cache, slab, object, NULL, 1, _RET_IP_);
+ slab_free(slab->slab_cache, slab, object, NULL, &object, 1, _RET_IP_);
}
EXPORT_SYMBOL(kfree);
@@ -4861,6 +4868,9 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
+ if (sysfs_slab_alias(s, name))
+ return NULL;
+
s->refcount++;
/*
@@ -4869,11 +4879,6 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
*/
s->object_size = max(s->object_size, size);
s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
-
- if (sysfs_slab_alias(s, name)) {
- s->refcount--;
- s = NULL;
- }
}
return s;
@@ -4919,7 +4924,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
ret = slab_alloc(s, NULL, gfpflags, caller, size);
/* Honor the call site pointer we received. */
- trace_kmalloc(caller, ret, size, s->size, gfpflags);
+ trace_kmalloc(caller, ret, s, size, s->size, gfpflags);
return ret;
}
@@ -4935,7 +4940,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
ret = kmalloc_large_node(size, gfpflags, node);
- trace_kmalloc_node(caller, ret,
+ trace_kmalloc_node(caller, ret, NULL,
size, PAGE_SIZE << get_order(size),
gfpflags, node);
@@ -4950,7 +4955,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
ret = slab_alloc_node(s, NULL, gfpflags, node, caller, size);
/* Honor the call site pointer we received. */
- trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
+ trace_kmalloc_node(caller, ret, s, size, s->size, gfpflags, node);
return ret;
}
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index f4fa61dbbee3..5f0ed4717ed0 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -78,6 +78,14 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_leaf(*pmd))) {
+ /*
+ * Higher order allocations from buddy allocator must be able to
+ * be treated as indepdenent small pages (as they can be freed
+ * individually).
+ */
+ if (!PageReserved(page))
+ split_page(page, get_order(PMD_SIZE));
+
/* Make pte visible before pmd. See comment in pmd_install(). */
smp_wmb();
pmd_populate_kernel(&init_mm, pmd, pgtable);
@@ -200,8 +208,8 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end,
unsigned long next;
pgd_t *pgd;
- VM_BUG_ON(!IS_ALIGNED(start, PAGE_SIZE));
- VM_BUG_ON(!IS_ALIGNED(end, PAGE_SIZE));
+ VM_BUG_ON(!PAGE_ALIGNED(start));
+ VM_BUG_ON(!PAGE_ALIGNED(end));
pgd = pgd_offset_k(addr);
do {
@@ -528,7 +536,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
int actual_node = early_pfn_to_nid(pfn);
if (node_distance(actual_node, node) > LOCAL_DISTANCE)
- pr_warn("[%lx-%lx] potential offnode page_structs\n",
+ pr_warn_once("[%lx-%lx] potential offnode page_structs\n",
start, end - 1);
}
@@ -548,7 +556,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
} else {
/*
* When a PTE/PMD entry is freed from the init_mm
- * there's a a free_pages() call to this page allocated
+ * there's a free_pages() call to this page allocated
* above. Thus this get_page() is paired with the
* put_page_testzero() on the freeing path.
* This can only called by certain ZONE_DEVICE path,
@@ -737,7 +745,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
for (addr = start; addr < end; addr += size) {
- unsigned long next = addr, last = addr + size;
+ unsigned long next, last = addr + size;
/* Populate the head page vmemmap page */
pte = vmemmap_populate_address(addr, node, NULL, NULL);
@@ -752,7 +760,7 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
/*
* Reuse the previous page for the rest of tail pages
- * See layout diagram in Documentation/vm/vmemmap_dedup.rst
+ * See layout diagram in Documentation/mm/vmemmap_dedup.rst
*/
next += PAGE_SIZE;
rc = vmemmap_populate_range(next, last, node, NULL,
diff --git a/mm/sparse.c b/mm/sparse.c
index cb3bfae64036..e5a8a3a0edd7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -281,7 +281,7 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
{
unsigned long coded_mem_map =
(unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
- BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT));
+ BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
return coded_mem_map;
}
diff --git a/mm/swap.c b/mm/swap.c
index f3922a96b2e9..9cee7f6a3809 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -46,30 +46,30 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-/* Protecting only lru_rotate.pvec which requires disabling interrupts */
+/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
struct lru_rotate {
local_lock_t lock;
- struct pagevec pvec;
+ struct folio_batch fbatch;
};
static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
.lock = INIT_LOCAL_LOCK(lock),
};
/*
- * The following struct pagevec are grouped together because they are protected
+ * The following folio batches are grouped together because they are protected
* by disabling preemption (and interrupts remain enabled).
*/
-struct lru_pvecs {
+struct cpu_fbatches {
local_lock_t lock;
- struct pagevec lru_add;
- struct pagevec lru_deactivate_file;
- struct pagevec lru_deactivate;
- struct pagevec lru_lazyfree;
+ struct folio_batch lru_add;
+ struct folio_batch lru_deactivate_file;
+ struct folio_batch lru_deactivate;
+ struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
- struct pagevec activate_page;
+ struct folio_batch activate;
#endif
};
-static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
+static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
.lock = INIT_LOCAL_LOCK(lock),
};
@@ -77,36 +77,35 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
* This path almost never happens for VM activity - pages are normally freed
* via pagevecs. But it gets used by networking - and for compound pages.
*/
-static void __page_cache_release(struct page *page)
+static void __page_cache_release(struct folio *folio)
{
- if (PageLRU(page)) {
- struct folio *folio = page_folio(page);
+ if (folio_test_lru(folio)) {
struct lruvec *lruvec;
unsigned long flags;
lruvec = folio_lruvec_lock_irqsave(folio, &flags);
- del_page_from_lru_list(page, lruvec);
- __clear_page_lru_flags(page);
+ lruvec_del_folio(lruvec, folio);
+ __folio_clear_lru_flags(folio);
unlock_page_lruvec_irqrestore(lruvec, flags);
}
- /* See comment on PageMlocked in release_pages() */
- if (unlikely(PageMlocked(page))) {
- int nr_pages = thp_nr_pages(page);
+ /* See comment on folio_test_mlocked in release_pages() */
+ if (unlikely(folio_test_mlocked(folio))) {
+ long nr_pages = folio_nr_pages(folio);
- __ClearPageMlocked(page);
- mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ __folio_clear_mlocked(folio);
+ zone_stat_mod_folio(folio, NR_MLOCK, -nr_pages);
count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
}
}
-static void __put_single_page(struct page *page)
+static void __folio_put_small(struct folio *folio)
{
- __page_cache_release(page);
- mem_cgroup_uncharge(page_folio(page));
- free_unref_page(page, 0);
+ __page_cache_release(folio);
+ mem_cgroup_uncharge(folio);
+ free_unref_page(&folio->page, 0);
}
-static void __put_compound_page(struct page *page)
+static void __folio_put_large(struct folio *folio)
{
/*
* __page_cache_release() is supposed to be called for thp, not for
@@ -114,21 +113,21 @@ static void __put_compound_page(struct page *page)
* (it's never listed to any LRU lists) and no memcg routines should
* be called for hugetlb (it has a separate hugetlb_cgroup.)
*/
- if (!PageHuge(page))
- __page_cache_release(page);
- destroy_compound_page(page);
+ if (!folio_test_hugetlb(folio))
+ __page_cache_release(folio);
+ destroy_large_folio(folio);
}
-void __put_page(struct page *page)
+void __folio_put(struct folio *folio)
{
- if (unlikely(is_zone_device_page(page)))
- free_zone_device_page(page);
- else if (unlikely(PageCompound(page)))
- __put_compound_page(page);
+ if (unlikely(folio_is_zone_device(folio)))
+ free_zone_device_page(&folio->page);
+ else if (unlikely(folio_test_large(folio)))
+ __folio_put_large(folio);
else
- __put_single_page(page);
+ __folio_put_small(folio);
}
-EXPORT_SYMBOL(__put_page);
+EXPORT_SYMBOL(__folio_put);
/**
* put_pages_list() - release a list of pages
@@ -138,19 +137,19 @@ EXPORT_SYMBOL(__put_page);
*/
void put_pages_list(struct list_head *pages)
{
- struct page *page, *next;
+ struct folio *folio, *next;
- list_for_each_entry_safe(page, next, pages, lru) {
- if (!put_page_testzero(page)) {
- list_del(&page->lru);
+ list_for_each_entry_safe(folio, next, pages, lru) {
+ if (!folio_put_testzero(folio)) {
+ list_del(&folio->lru);
continue;
}
- if (PageHead(page)) {
- list_del(&page->lru);
- __put_compound_page(page);
+ if (folio_test_large(folio)) {
+ list_del(&folio->lru);
+ __folio_put_large(folio);
continue;
}
- /* Cannot be PageLRU because it's passed to us using the lru */
+ /* LRU flag must be clear because it's passed using the lru */
}
free_unref_page_list(pages);
@@ -188,36 +187,84 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
}
EXPORT_SYMBOL_GPL(get_kernel_pages);
-static void pagevec_lru_move_fn(struct pagevec *pvec,
- void (*move_fn)(struct page *page, struct lruvec *lruvec))
+typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
+
+static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
+{
+ int was_unevictable = folio_test_clear_unevictable(folio);
+ long nr_pages = folio_nr_pages(folio);
+
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+
+ /*
+ * Is an smp_mb__after_atomic() still required here, before
+ * folio_evictable() tests the mlocked flag, to rule out the possibility
+ * of stranding an evictable folio on an unevictable LRU? I think
+ * not, because __munlock_page() only clears the mlocked flag
+ * while the LRU lock is held.
+ *
+ * (That is not true of __page_cache_release(), and not necessarily
+ * true of release_pages(): but those only clear the mlocked flag after
+ * folio_put_testzero() has excluded any other users of the folio.)
+ */
+ if (folio_evictable(folio)) {
+ if (was_unevictable)
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
+ } else {
+ folio_clear_active(folio);
+ folio_set_unevictable(folio);
+ /*
+ * folio->mlock_count = !!folio_test_mlocked(folio)?
+ * But that leaves __mlock_page() in doubt whether another
+ * actor has already counted the mlock or not. Err on the
+ * safe side, underestimate, let page reclaim fix it, rather
+ * than leaving a page on the unevictable LRU indefinitely.
+ */
+ folio->mlock_count = 0;
+ if (!was_unevictable)
+ __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
+ }
+
+ lruvec_add_folio(lruvec, folio);
+ trace_mm_lru_insertion(folio);
+}
+
+static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
int i;
struct lruvec *lruvec = NULL;
unsigned long flags = 0;
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct page *page = pvec->pages[i];
- struct folio *folio = page_folio(page);
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
- /* block memcg migration during page moving between lru */
- if (!TestClearPageLRU(page))
+ /* block memcg migration while the folio moves between lru */
+ if (move_fn != lru_add_fn && !folio_test_clear_lru(folio))
continue;
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
- (*move_fn)(page, lruvec);
+ move_fn(lruvec, folio);
- SetPageLRU(page);
+ folio_set_lru(folio);
}
+
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
- release_pages(pvec->pages, pvec->nr);
- pagevec_reinit(pvec);
+ folios_put(fbatch->folios, folio_batch_count(fbatch));
+ folio_batch_init(fbatch);
}
-static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
+static void folio_batch_add_and_move(struct folio_batch *fbatch,
+ struct folio *folio, move_fn_t move_fn)
{
- struct folio *folio = page_folio(page);
+ if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
+ !lru_cache_disabled())
+ return;
+ folio_batch_move_lru(fbatch, move_fn);
+}
+static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
+{
if (!folio_test_unevictable(folio)) {
lruvec_del_folio(lruvec, folio);
folio_clear_active(folio);
@@ -226,18 +273,6 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
}
}
-/* return true if pagevec needs to drain */
-static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
-{
- bool ret = false;
-
- if (!pagevec_add(pvec, page) || PageCompound(page) ||
- lru_cache_disabled())
- ret = true;
-
- return ret;
-}
-
/*
* Writeback is about to end against a folio which has been marked for
* immediate reclaim. If it still appears to be reclaimable, move it
@@ -249,14 +284,13 @@ void folio_rotate_reclaimable(struct folio *folio)
{
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
!folio_test_unevictable(folio) && folio_test_lru(folio)) {
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
unsigned long flags;
folio_get(folio);
local_lock_irqsave(&lru_rotate.lock, flags);
- pvec = this_cpu_ptr(&lru_rotate.pvec);
- if (pagevec_add_and_need_flush(pvec, &folio->page))
- pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
+ fbatch = this_cpu_ptr(&lru_rotate.fbatch);
+ folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
}
@@ -307,7 +341,7 @@ void lru_note_cost_folio(struct folio *folio)
folio_nr_pages(folio));
}
-static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
+static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
{
if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
long nr_pages = folio_nr_pages(folio);
@@ -324,41 +358,30 @@ static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
}
#ifdef CONFIG_SMP
-static void __activate_page(struct page *page, struct lruvec *lruvec)
+static void folio_activate_drain(int cpu)
{
- return __folio_activate(page_folio(page), lruvec);
-}
-
-static void activate_page_drain(int cpu)
-{
- struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
-
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, __activate_page);
-}
+ struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu);
-static bool need_activate_page_drain(int cpu)
-{
- return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, folio_activate_fn);
}
static void folio_activate(struct folio *folio)
{
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
folio_get(folio);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.activate_page);
- if (pagevec_add_and_need_flush(pvec, &folio->page))
- pagevec_lru_move_fn(pvec, __activate_page);
- local_unlock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.activate);
+ folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
#else
-static inline void activate_page_drain(int cpu)
+static inline void folio_activate_drain(int cpu)
{
}
@@ -368,7 +391,7 @@ static void folio_activate(struct folio *folio)
if (folio_test_clear_lru(folio)) {
lruvec = folio_lruvec_lock_irq(folio);
- __folio_activate(folio, lruvec);
+ folio_activate_fn(lruvec, folio);
unlock_page_lruvec_irq(lruvec);
folio_set_lru(folio);
}
@@ -377,32 +400,32 @@ static void folio_activate(struct folio *folio)
static void __lru_cache_activate_folio(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
int i;
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
/*
- * Search backwards on the optimistic assumption that the page being
- * activated has just been added to this pagevec. Note that only
- * the local pagevec is examined as a !PageLRU page could be in the
+ * Search backwards on the optimistic assumption that the folio being
+ * activated has just been added to this batch. Note that only
+ * the local batch is examined as a !LRU folio could be in the
* process of being released, reclaimed, migrated or on a remote
- * pagevec that is currently being drained. Furthermore, marking
- * a remote pagevec's page PageActive potentially hits a race where
- * a page is marked PageActive just after it is added to the inactive
+ * batch that is currently being drained. Furthermore, marking
+ * a remote batch's folio active potentially hits a race where
+ * a folio is marked active just after it is added to the inactive
* list causing accounting errors and BUG_ON checks to trigger.
*/
- for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
- struct page *pagevec_page = pvec->pages[i];
+ for (i = folio_batch_count(fbatch) - 1; i >= 0; i--) {
+ struct folio *batch_folio = fbatch->folios[i];
- if (pagevec_page == &folio->page) {
+ if (batch_folio == folio) {
folio_set_active(folio);
break;
}
}
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
}
/*
@@ -427,9 +450,9 @@ void folio_mark_accessed(struct folio *folio)
*/
} else if (!folio_test_active(folio)) {
/*
- * If the page is on the LRU, queue it for activation via
- * lru_pvecs.activate_page. Otherwise, assume the page is on a
- * pagevec, mark it active and it'll be moved to the active
+ * If the folio is on the LRU, queue it for activation via
+ * cpu_fbatches.activate. Otherwise, assume the folio is in a
+ * folio_batch, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
if (folio_test_lru(folio))
@@ -450,22 +473,22 @@ EXPORT_SYMBOL(folio_mark_accessed);
*
* Queue the folio for addition to the LRU. The decision on whether
* to add the page to the [in]active [file|anon] list is deferred until the
- * pagevec is drained. This gives a chance for the caller of folio_add_lru()
+ * folio_batch is drained. This gives a chance for the caller of folio_add_lru()
* have the folio added to the active list using folio_mark_accessed().
*/
void folio_add_lru(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
- VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_active(folio) &&
+ folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
folio_get(folio);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_add);
- if (pagevec_add_and_need_flush(pvec, &folio->page))
- __pagevec_lru_add(pvec);
- local_unlock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
+ folio_batch_add_and_move(fbatch, folio, lru_add_fn);
+ local_unlock(&cpu_fbatches.lock);
}
EXPORT_SYMBOL(folio_add_lru);
@@ -489,56 +512,57 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
}
/*
- * If the page can not be invalidated, it is moved to the
+ * If the folio cannot be invalidated, it is moved to the
* inactive list to speed up its reclaim. It is moved to the
* head of the list, rather than the tail, to give the flusher
* threads some time to write it out, as this is much more
* effective than the single-page writeout from reclaim.
*
- * If the page isn't page_mapped and dirty/writeback, the page
- * could reclaim asap using PG_reclaim.
+ * If the folio isn't mapped and dirty/writeback, the folio
+ * could be reclaimed asap using the reclaim flag.
*
- * 1. active, mapped page -> none
- * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
- * 3. inactive, mapped page -> none
- * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
+ * 1. active, mapped folio -> none
+ * 2. active, dirty/writeback folio -> inactive, head, reclaim
+ * 3. inactive, mapped folio -> none
+ * 4. inactive, dirty/writeback folio -> inactive, head, reclaim
* 5. inactive, clean -> inactive, tail
* 6. Others -> none
*
- * In 4, why it moves inactive's head, the VM expects the page would
- * be write it out by flusher threads as this is much more effective
+ * In 4, it moves to the head of the inactive list so the folio is
+ * written out by flusher threads as this is much more efficient
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
+static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
{
- bool active = PageActive(page);
- int nr_pages = thp_nr_pages(page);
+ bool active = folio_test_active(folio);
+ long nr_pages = folio_nr_pages(folio);
- if (PageUnevictable(page))
+ if (folio_test_unevictable(folio))
return;
- /* Some processes are using the page */
- if (page_mapped(page))
+ /* Some processes are using the folio */
+ if (folio_mapped(folio))
return;
- del_page_from_lru_list(page, lruvec);
- ClearPageActive(page);
- ClearPageReferenced(page);
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
- if (PageWriteback(page) || PageDirty(page)) {
+ if (folio_test_writeback(folio) || folio_test_dirty(folio)) {
/*
- * PG_reclaim could be raced with end_page_writeback
- * It can make readahead confusing. But race window
- * is _really_ small and it's non-critical problem.
+ * Setting the reclaim flag could race with
+ * folio_end_writeback() and confuse readahead. But the
+ * race window is _really_ small and it's not a critical
+ * problem.
*/
- add_page_to_lru_list(page, lruvec);
- SetPageReclaim(page);
+ lruvec_add_folio(lruvec, folio);
+ folio_set_reclaim(folio);
} else {
/*
- * The page's writeback ends up during pagevec
- * We move that page into tail of inactive.
+ * The folio's writeback ended while it was in the batch.
+ * We move that folio to the tail of the inactive list.
*/
- add_page_to_lru_list_tail(page, lruvec);
+ lruvec_add_folio_tail(lruvec, folio);
__count_vm_events(PGROTATED, nr_pages);
}
@@ -549,15 +573,15 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
}
}
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
+static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (PageActive(page) && !PageUnevictable(page)) {
- int nr_pages = thp_nr_pages(page);
+ if (folio_test_active(folio) && !folio_test_unevictable(folio)) {
+ long nr_pages = folio_nr_pages(folio);
- del_page_from_lru_list(page, lruvec);
- ClearPageActive(page);
- ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec);
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
+ lruvec_add_folio(lruvec, folio);
__count_vm_events(PGDEACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
@@ -565,22 +589,22 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
}
}
-static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
+static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio)
{
- if (PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page) && !PageUnevictable(page)) {
- int nr_pages = thp_nr_pages(page);
+ if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
+ long nr_pages = folio_nr_pages(folio);
- del_page_from_lru_list(page, lruvec);
- ClearPageActive(page);
- ClearPageReferenced(page);
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
/*
- * Lazyfree pages are clean anonymous pages. They have
- * PG_swapbacked flag cleared, to distinguish them from normal
- * anonymous pages
+ * Lazyfree folios are clean anonymous folios. They have
+ * the swapbacked flag cleared, to distinguish them from normal
+ * anonymous folios
*/
- ClearPageSwapBacked(page);
- add_page_to_lru_list(page, lruvec);
+ folio_clear_swapbacked(folio);
+ lruvec_add_folio(lruvec, folio);
__count_vm_events(PGLAZYFREE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
@@ -589,71 +613,67 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
}
/*
- * Drain pages out of the cpu's pagevecs.
+ * Drain pages out of the cpu's folio_batch.
* Either "cpu" is the current CPU, and preemption has already been
* disabled; or "cpu" is being hot-unplugged, and is already dead.
*/
void lru_add_drain_cpu(int cpu)
{
- struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
+ struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
+ struct folio_batch *fbatch = &fbatches->lru_add;
- if (pagevec_count(pvec))
- __pagevec_lru_add(pvec);
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_add_fn);
- pvec = &per_cpu(lru_rotate.pvec, cpu);
+ fbatch = &per_cpu(lru_rotate.fbatch, cpu);
/* Disabling interrupts below acts as a compiler barrier. */
- if (data_race(pagevec_count(pvec))) {
+ if (data_race(folio_batch_count(fbatch))) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
local_lock_irqsave(&lru_rotate.lock, flags);
- pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
+ folio_batch_move_lru(fbatch, lru_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
- pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
+ fbatch = &fbatches->lru_deactivate_file;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_deactivate_file_fn);
- pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn);
+ fbatch = &fbatches->lru_deactivate;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_deactivate_fn);
- pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
- if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
+ fbatch = &fbatches->lru_lazyfree;
+ if (folio_batch_count(fbatch))
+ folio_batch_move_lru(fbatch, lru_lazyfree_fn);
- activate_page_drain(cpu);
+ folio_activate_drain(cpu);
}
/**
- * deactivate_file_folio() - Forcefully deactivate a file folio.
+ * deactivate_file_folio() - Deactivate a file folio.
* @folio: Folio to deactivate.
*
* This function hints to the VM that @folio is a good reclaim candidate,
* for example if its invalidation fails due to the folio being dirty
* or under writeback.
*
- * Context: Caller holds a reference on the page.
+ * Context: Caller holds a reference on the folio.
*/
void deactivate_file_folio(struct folio *folio)
{
- struct pagevec *pvec;
+ struct folio_batch *fbatch;
- /*
- * In a workload with many unevictable pages such as mprotect,
- * unevictable folio deactivation for accelerating reclaim is pointless.
- */
+ /* Deactivating an unevictable folio will not accelerate reclaim */
if (folio_test_unevictable(folio))
return;
folio_get(folio);
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
-
- if (pagevec_add_and_need_flush(pvec, &folio->page))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
- local_unlock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
+ folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
+ local_unlock(&cpu_fbatches.lock);
}
/*
@@ -666,15 +686,17 @@ void deactivate_file_folio(struct folio *folio)
*/
void deactivate_page(struct page *page)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
- get_page(page);
- if (pagevec_add_and_need_flush(pvec, page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn);
- local_unlock(&lru_pvecs.lock);
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_lru(folio) && folio_test_active(folio) &&
+ !folio_test_unevictable(folio)) {
+ struct folio_batch *fbatch;
+
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
+ folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
@@ -687,24 +709,26 @@ void deactivate_page(struct page *page)
*/
void mark_page_lazyfree(struct page *page)
{
- if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
- !PageSwapCache(page) && !PageUnevictable(page)) {
- struct pagevec *pvec;
-
- local_lock(&lru_pvecs.lock);
- pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
- get_page(page);
- if (pagevec_add_and_need_flush(pvec, page))
- pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
- local_unlock(&lru_pvecs.lock);
+ struct folio *folio = page_folio(page);
+
+ if (folio_test_lru(folio) && folio_test_anon(folio) &&
+ folio_test_swapbacked(folio) && !folio_test_swapcache(folio) &&
+ !folio_test_unevictable(folio)) {
+ struct folio_batch *fbatch;
+
+ folio_get(folio);
+ local_lock(&cpu_fbatches.lock);
+ fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
+ folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
+ local_unlock(&cpu_fbatches.lock);
}
}
void lru_add_drain(void)
{
- local_lock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
mlock_page_drain_local();
}
@@ -716,19 +740,19 @@ void lru_add_drain(void)
*/
static void lru_add_and_bh_lrus_drain(void)
{
- local_lock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
invalidate_bh_lrus_cpu();
mlock_page_drain_local();
}
void lru_add_drain_cpu_zone(struct zone *zone)
{
- local_lock(&lru_pvecs.lock);
+ local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
drain_local_pages(zone);
- local_unlock(&lru_pvecs.lock);
+ local_unlock(&cpu_fbatches.lock);
mlock_page_drain_local();
}
@@ -741,6 +765,21 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy)
lru_add_and_bh_lrus_drain();
}
+static bool cpu_needs_drain(unsigned int cpu)
+{
+ struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
+
+ /* Check these in order of likelihood that they're not zero */
+ return folio_batch_count(&fbatches->lru_add) ||
+ data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) ||
+ folio_batch_count(&fbatches->lru_deactivate_file) ||
+ folio_batch_count(&fbatches->lru_deactivate) ||
+ folio_batch_count(&fbatches->lru_lazyfree) ||
+ folio_batch_count(&fbatches->activate) ||
+ need_mlock_page_drain(cpu) ||
+ has_bh_in_lru(cpu, NULL);
+}
+
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -773,8 +812,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
return;
/*
- * Guarantee pagevec counter stores visible by this CPU are visible to
- * other CPUs before loading the current drain generation.
+ * Guarantee folio_batch counter stores visible by this CPU
+ * are visible to other CPUs before loading the current drain
+ * generation.
*/
smp_mb();
@@ -800,8 +840,9 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
* (D) Increment global generation number
*
* Pairs with smp_load_acquire() at (B), outside of the critical
- * section. Use a full memory barrier to guarantee that the new global
- * drain generation number is stored before loading pagevec counters.
+ * section. Use a full memory barrier to guarantee that the
+ * new global drain generation number is stored before loading
+ * folio_batch counters.
*
* This pairing must be done here, before the for_each_online_cpu loop
* below which drains the page vectors.
@@ -823,14 +864,7 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
- if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
- data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
- pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
- need_activate_page_drain(cpu) ||
- need_mlock_page_drain(cpu) ||
- has_bh_in_lru(cpu, NULL)) {
+ if (cpu_needs_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
__cpumask_set_cpu(cpu, &has_work);
@@ -881,7 +915,7 @@ void lru_cache_disable(void)
* lru_disable_count = 0 will have exited the critical
* section when synchronize_rcu() returns.
*/
- synchronize_rcu();
+ synchronize_rcu_expedited();
#ifdef CONFIG_SMP
__lru_add_drain_all(true);
#else
@@ -906,8 +940,7 @@ void release_pages(struct page **pages, int nr)
unsigned int lock_batch;
for (i = 0; i < nr; i++) {
- struct page *page = pages[i];
- struct folio *folio = page_folio(page);
+ struct folio *folio = page_folio(pages[i]);
/*
* Make sure the IRQ-safe lock-holding time does not get
@@ -919,35 +952,34 @@ void release_pages(struct page **pages, int nr)
lruvec = NULL;
}
- page = &folio->page;
- if (is_huge_zero_page(page))
+ if (is_huge_zero_page(&folio->page))
continue;
- if (is_zone_device_page(page)) {
+ if (folio_is_zone_device(folio)) {
if (lruvec) {
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- if (put_devmap_managed_page(page))
+ if (put_devmap_managed_page(&folio->page))
continue;
- if (put_page_testzero(page))
- free_zone_device_page(page);
+ if (folio_put_testzero(folio))
+ free_zone_device_page(&folio->page);
continue;
}
- if (!put_page_testzero(page))
+ if (!folio_put_testzero(folio))
continue;
- if (PageCompound(page)) {
+ if (folio_test_large(folio)) {
if (lruvec) {
unlock_page_lruvec_irqrestore(lruvec, flags);
lruvec = NULL;
}
- __put_compound_page(page);
+ __folio_put_large(folio);
continue;
}
- if (PageLRU(page)) {
+ if (folio_test_lru(folio)) {
struct lruvec *prev_lruvec = lruvec;
lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
@@ -955,8 +987,8 @@ void release_pages(struct page **pages, int nr)
if (prev_lruvec != lruvec)
lock_batch = 0;
- del_page_from_lru_list(page, lruvec);
- __clear_page_lru_flags(page);
+ lruvec_del_folio(lruvec, folio);
+ __folio_clear_lru_flags(folio);
}
/*
@@ -965,13 +997,13 @@ void release_pages(struct page **pages, int nr)
* found set here. This does not indicate a problem, unless
* "unevictable_pgs_cleared" appears worryingly large.
*/
- if (unlikely(PageMlocked(page))) {
- __ClearPageMlocked(page);
- dec_zone_page_state(page, NR_MLOCK);
+ if (unlikely(folio_test_mlocked(folio))) {
+ __folio_clear_mlocked(folio);
+ zone_stat_sub_folio(folio, NR_MLOCK);
count_vm_event(UNEVICTABLE_PGCLEARED);
}
- list_add(&page->lru, &pages_to_free);
+ list_add(&folio->lru, &pages_to_free);
}
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
@@ -987,8 +1019,8 @@ EXPORT_SYMBOL(release_pages);
* OK from a correctness point of view but is inefficient - those pages may be
* cache-warm and we want to give them back to the page allocator ASAP.
*
- * So __pagevec_release() will drain those queues here. __pagevec_lru_add()
- * and __pagevec_lru_add_active() call release_pages() directly to avoid
+ * So __pagevec_release() will drain those queues here.
+ * folio_batch_move_lru() calls folios_put() directly to avoid
* mutual recursion.
*/
void __pagevec_release(struct pagevec *pvec)
@@ -1002,69 +1034,6 @@ void __pagevec_release(struct pagevec *pvec)
}
EXPORT_SYMBOL(__pagevec_release);
-static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
-{
- int was_unevictable = folio_test_clear_unevictable(folio);
- long nr_pages = folio_nr_pages(folio);
-
- VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
-
- folio_set_lru(folio);
- /*
- * Is an smp_mb__after_atomic() still required here, before
- * folio_evictable() tests PageMlocked, to rule out the possibility
- * of stranding an evictable folio on an unevictable LRU? I think
- * not, because __munlock_page() only clears PageMlocked while the LRU
- * lock is held.
- *
- * (That is not true of __page_cache_release(), and not necessarily
- * true of release_pages(): but those only clear PageMlocked after
- * put_page_testzero() has excluded any other users of the page.)
- */
- if (folio_evictable(folio)) {
- if (was_unevictable)
- __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
- } else {
- folio_clear_active(folio);
- folio_set_unevictable(folio);
- /*
- * folio->mlock_count = !!folio_test_mlocked(folio)?
- * But that leaves __mlock_page() in doubt whether another
- * actor has already counted the mlock or not. Err on the
- * safe side, underestimate, let page reclaim fix it, rather
- * than leaving a page on the unevictable LRU indefinitely.
- */
- folio->mlock_count = 0;
- if (!was_unevictable)
- __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
- }
-
- lruvec_add_folio(lruvec, folio);
- trace_mm_lru_insertion(folio);
-}
-
-/*
- * Add the passed pages to the LRU, then drop the caller's refcount
- * on them. Reinitialises the caller's pagevec.
- */
-void __pagevec_lru_add(struct pagevec *pvec)
-{
- int i;
- struct lruvec *lruvec = NULL;
- unsigned long flags = 0;
-
- for (i = 0; i < pagevec_count(pvec); i++) {
- struct folio *folio = page_folio(pvec->pages[i]);
-
- lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
- __pagevec_lru_add_fn(folio, lruvec);
- }
- if (lruvec)
- unlock_page_lruvec_irqrestore(lruvec, flags);
- release_pages(pvec->pages, pvec->nr);
- pagevec_reinit(pvec);
-}
-
/**
* folio_batch_remove_exceptionals() - Prune non-folios from a batch.
* @fbatch: The batch to prune
@@ -1086,35 +1055,6 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
fbatch->nr = j;
}
-/**
- * pagevec_lookup_range - gang pagecache lookup
- * @pvec: Where the resulting pages are placed
- * @mapping: The address_space to search
- * @start: The starting page index
- * @end: The final page index
- *
- * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
- * pages in the mapping starting from index @start and upto index @end
- * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a
- * reference against the pages in @pvec.
- *
- * The search returns a group of mapping-contiguous pages with ascending
- * indexes. There may be holes in the indices due to not-present pages. We
- * also update @start to index the next page for the traversal.
- *
- * pagevec_lookup_range() returns the number of pages which were found. If this
- * number is smaller than PAGEVEC_SIZE, the end of specified range has been
- * reached.
- */
-unsigned pagevec_lookup_range(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *start, pgoff_t end)
-{
- pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
- pvec->pages);
- return pagevec_count(pvec);
-}
-EXPORT_SYMBOL(pagevec_lookup_range);
-
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
xa_mark_t tag)
diff --git a/mm/swap.h b/mm/swap.h
index 0193797b0c92..17936e068c1c 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -36,12 +36,11 @@ bool add_to_swap(struct folio *folio);
void *get_shadow_from_swap_cache(swp_entry_t entry);
int add_to_swap_cache(struct page *page, swp_entry_t entry,
gfp_t gfp, void **shadowp);
-void __delete_from_swap_cache(struct page *page,
+void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow);
-void delete_from_swap_cache(struct page *page);
+void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
-void free_swap_cache(struct page *page);
struct page *lookup_swap_cache(swp_entry_t entry,
struct vm_area_struct *vma,
unsigned long addr);
@@ -61,9 +60,9 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
struct vm_fault *vmf);
-static inline unsigned int page_swap_flags(struct page *page)
+static inline unsigned int folio_swap_flags(struct folio *folio)
{
- return page_swap_info(page)->flags;
+ return page_swap_info(&folio->page)->flags;
}
#else /* CONFIG_SWAP */
struct swap_iocb;
@@ -81,10 +80,6 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
return NULL;
}
-static inline void free_swap_cache(struct page *page)
-{
-}
-
static inline void show_swap_cache_info(void)
{
}
@@ -135,12 +130,12 @@ static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
return -1;
}
-static inline void __delete_from_swap_cache(struct page *page,
+static inline void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow)
{
}
-static inline void delete_from_swap_cache(struct page *page)
+static inline void delete_from_swap_cache(struct folio *folio)
{
}
@@ -149,7 +144,7 @@ static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
{
}
-static inline unsigned int page_swap_flags(struct page *page)
+static inline unsigned int folio_swap_flags(struct folio *folio)
{
return 0;
}
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 2a65a89b5b4d..10b94d64cc25 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -307,7 +307,7 @@ swp_entry_t folio_alloc_swap(struct folio *folio)
entry.val = 0;
if (folio_test_large(folio)) {
- if (IS_ENABLED(CONFIG_THP_SWAP))
+ if (IS_ENABLED(CONFIG_THP_SWAP) && arch_thp_swp_supported())
get_swap_pages(1, &entry, folio_nr_pages(folio));
goto out;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 778d57d2d92d..e166051566f4 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
.writepage = swap_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_MIGRATION
- .migratepage = migrate_page,
+ .migrate_folio = migrate_folio,
#endif
};
@@ -59,24 +59,11 @@ static bool enable_vma_readahead __read_mostly = true;
#define GET_SWAP_RA_VAL(vma) \
(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
-#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++)
-#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr))
-
-static struct {
- unsigned long add_total;
- unsigned long del_total;
- unsigned long find_success;
- unsigned long find_total;
-} swap_cache_info;
-
static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
void show_swap_cache_info(void)
{
printk("%lu pages in swap cache\n", total_swapcache_pages());
- printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
- swap_cache_info.add_total, swap_cache_info.del_total,
- swap_cache_info.find_success, swap_cache_info.find_total);
printk("Free swap = %ldkB\n",
get_nr_swap_pages() << (PAGE_SHIFT - 10));
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
@@ -95,7 +82,7 @@ void *get_shadow_from_swap_cache(swp_entry_t entry)
}
/*
- * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * add_to_swap_cache resembles filemap_add_folio on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
int add_to_swap_cache(struct page *page, swp_entry_t entry,
@@ -133,7 +120,6 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry,
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_lruvec_page_state(page, NR_SWAPCACHE, nr);
- ADD_CACHE_INFO(add_total, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
@@ -147,32 +133,32 @@ unlock:
}
/*
- * This must be called only on pages that have
+ * This must be called only on folios that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page,
+void __delete_from_swap_cache(struct folio *folio,
swp_entry_t entry, void *shadow)
{
struct address_space *address_space = swap_address_space(entry);
- int i, nr = thp_nr_pages(page);
+ int i;
+ long nr = folio_nr_pages(folio);
pgoff_t idx = swp_offset(entry);
XA_STATE(xas, &address_space->i_pages, idx);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- VM_BUG_ON_PAGE(PageWriteback(page), page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapcache(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio);
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_PAGE(entry != page, entry);
- set_page_private(page + i, 0);
+ VM_BUG_ON_FOLIO(entry != folio, folio);
+ set_page_private(folio_page(folio, i), 0);
xas_next(&xas);
}
- ClearPageSwapCache(page);
+ folio_clear_swapcache(folio);
address_space->nrpages -= nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
- __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr);
- ADD_CACHE_INFO(del_total, nr);
+ __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, -nr);
}
/**
@@ -237,22 +223,22 @@ fail:
}
/*
- * This must be called only on pages that have
+ * This must be called only on folios that have
* been verified to be in the swap cache and locked.
- * It will never put the page into the free list,
- * the caller has a reference on the page.
+ * It will never put the folio into the free list,
+ * the caller has a reference on the folio.
*/
-void delete_from_swap_cache(struct page *page)
+void delete_from_swap_cache(struct folio *folio)
{
- swp_entry_t entry = { .val = page_private(page) };
+ swp_entry_t entry = folio_swap_entry(folio);
struct address_space *address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page, entry, NULL);
+ __delete_from_swap_cache(folio, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
- put_swap_page(page, entry);
- page_ref_sub(page, thp_nr_pages(page));
+ put_swap_page(&folio->page, entry);
+ folio_ref_sub(folio, folio_nr_pages(folio));
}
void clear_shadow_from_swap_cache(int type, unsigned long begin,
@@ -348,12 +334,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma,
page = find_get_page(swap_address_space(entry), swp_offset(entry));
put_swap_device(si);
- INC_CACHE_INFO(find_total);
if (page) {
bool vma_ra = swap_use_vma_readahead();
bool readahead;
- INC_CACHE_INFO(find_success);
/*
* At the moment, we don't support PG_readahead for anon THP
* so let's bail out rather than confusing the readahead stat.
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a2e66d855b19..1fdccd2f1422 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -695,7 +695,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
si->lowest_bit += nr_entries;
if (end == si->highest_bit)
WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
- si->inuse_pages += nr_entries;
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages + nr_entries);
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
si->highest_bit = 0;
@@ -732,7 +732,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
add_to_avail_list(si);
}
atomic_long_add(nr_entries, &nr_swap_pages);
- si->inuse_pages -= nr_entries;
+ WRITE_ONCE(si->inuse_pages, si->inuse_pages - nr_entries);
if (si->flags & SWP_BLKDEV)
swap_slot_free_notify =
si->bdev->bd_disk->fops->swap_slot_free_notify;
@@ -1568,16 +1568,15 @@ unlock_out:
return ret;
}
-static bool page_swapped(struct page *page)
+static bool folio_swapped(struct folio *folio)
{
swp_entry_t entry;
struct swap_info_struct *si;
- if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
- return page_swapcount(page) != 0;
+ if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio)))
+ return page_swapcount(&folio->page) != 0;
- page = compound_head(page);
- entry.val = page_private(page);
+ entry = folio_swap_entry(folio);
si = _swap_info_get(entry);
if (si)
return swap_page_trans_huge_swapped(si, entry);
@@ -1590,13 +1589,14 @@ static bool page_swapped(struct page *page)
*/
int try_to_free_swap(struct page *page)
{
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ struct folio *folio = page_folio(page);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
- if (!PageSwapCache(page))
+ if (!folio_test_swapcache(folio))
return 0;
- if (PageWriteback(page))
+ if (folio_test_writeback(folio))
return 0;
- if (page_swapped(page))
+ if (folio_swapped(folio))
return 0;
/*
@@ -1617,9 +1617,8 @@ int try_to_free_swap(struct page *page)
if (pm_suspended_storage())
return 0;
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
+ delete_from_swap_cache(folio);
+ folio_set_dirty(folio);
return 1;
}
@@ -2640,7 +2639,7 @@ static int swap_show(struct seq_file *swap, void *v)
}
bytes = si->pages << (PAGE_SHIFT - 10);
- inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+ inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10);
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
@@ -3259,7 +3258,7 @@ void si_swapinfo(struct sysinfo *val)
struct swap_info_struct *si = swap_info[type];
if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
- nr_to_be_unused += si->inuse_pages;
+ nr_to_be_unused += READ_ONCE(si->inuse_pages);
}
val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
val->totalswap = total_swap_pages + nr_to_be_unused;
diff --git a/mm/truncate.c b/mm/truncate.c
index ab50d0d59a2a..0b0708bf935f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -443,7 +443,7 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
* mapping->invalidate_lock.
*
* Note: When this function returns, there can be a page in the process of
- * deletion (inside __delete_from_page_cache()) in the specified range. Thus
+ * deletion (inside __filemap_remove_folio()) in the specified range. Thus
* mapping->nrpages can be non-zero when this function returns even after
* truncation of the whole mapping.
*/
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 4e1da708699b..c1ee15a98633 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -161,7 +161,7 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
static inline void check_heap_object(const void *ptr, unsigned long n,
bool to_user)
{
- uintptr_t addr = (uintptr_t)ptr;
+ unsigned long addr = (unsigned long)ptr;
unsigned long offset;
struct folio *folio;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 4f4892a5f767..07d3befc80e4 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -246,7 +246,10 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
struct page *page;
int ret;
- ret = shmem_getpage(inode, pgoff, &page, SGP_READ);
+ ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC);
+ /* Our caller expects us to return -EFAULT if we failed to find page. */
+ if (ret == -ENOENT)
+ ret = -EFAULT;
if (ret)
goto out;
if (!page) {
diff --git a/mm/util.c b/mm/util.c
index 0837570c9225..c9439c66d8cf 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -804,10 +804,10 @@ struct address_space *folio_mapping(struct folio *folio)
return swap_address_space(folio_swap_entry(folio));
mapping = folio->mapping;
- if ((unsigned long)mapping & PAGE_MAPPING_ANON)
+ if ((unsigned long)mapping & PAGE_MAPPING_FLAGS)
return NULL;
- return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
+ return mapping;
}
EXPORT_SYMBOL(folio_mapping);
@@ -1005,7 +1005,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
* succeed and -ENOMEM implies there is not.
*
* We currently support three overcommit policies, which are set via the
- * vm.overcommit_memory sysctl. See Documentation/vm/overcommit-accounting.rst
+ * vm.overcommit_memory sysctl. See Documentation/mm/overcommit-accounting.rst
*
* Strict overcommit modes added 2002 Feb 26 by Alan Cox.
* Additional code 2002 Jul 20 by Robert Love.
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index effd1ff6a4b4..dd6cdb201195 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -790,6 +790,7 @@ unsigned long vmalloc_nr_pages(void)
return atomic_long_read(&nr_vmalloc_pages);
}
+/* Look up the first VA which satisfies addr < va_end, NULL if none. */
static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
{
struct vmap_area *va = NULL;
@@ -814,9 +815,9 @@ static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
return va;
}
-static struct vmap_area *__find_vmap_area(unsigned long addr)
+static struct vmap_area *__find_vmap_area(unsigned long addr, struct rb_root *root)
{
- struct rb_node *n = vmap_area_root.rb_node;
+ struct rb_node *n = root->rb_node;
addr = (unsigned long)kasan_reset_tag((void *)addr);
@@ -874,11 +875,9 @@ find_va_links(struct vmap_area *va,
* Trigger the BUG() if there are sides(left/right)
* or full overlaps.
*/
- if (va->va_start < tmp_va->va_end &&
- va->va_end <= tmp_va->va_start)
+ if (va->va_end <= tmp_va->va_start)
link = &(*link)->rb_left;
- else if (va->va_end > tmp_va->va_start &&
- va->va_start >= tmp_va->va_end)
+ else if (va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
else {
WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
@@ -911,8 +910,9 @@ get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
}
static __always_inline void
-link_va(struct vmap_area *va, struct rb_root *root,
- struct rb_node *parent, struct rb_node **link, struct list_head *head)
+__link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head, bool augment)
{
/*
* VA is still not in the list, but we can
@@ -926,12 +926,12 @@ link_va(struct vmap_area *va, struct rb_root *root,
/* Insert to the rb-tree */
rb_link_node(&va->rb_node, parent, link);
- if (root == &free_vmap_area_root) {
+ if (augment) {
/*
* Some explanation here. Just perform simple insertion
* to the tree. We do not set va->subtree_max_size to
* its current size before calling rb_insert_augmented().
- * It is because of we populate the tree from the bottom
+ * It is because we populate the tree from the bottom
* to parent levels when the node _is_ in the tree.
*
* Therefore we set subtree_max_size to zero after insertion,
@@ -950,21 +950,49 @@ link_va(struct vmap_area *va, struct rb_root *root,
}
static __always_inline void
-unlink_va(struct vmap_area *va, struct rb_root *root)
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head)
+{
+ __link_va(va, root, parent, link, head, false);
+}
+
+static __always_inline void
+link_va_augment(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link,
+ struct list_head *head)
+{
+ __link_va(va, root, parent, link, head, true);
+}
+
+static __always_inline void
+__unlink_va(struct vmap_area *va, struct rb_root *root, bool augment)
{
if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
return;
- if (root == &free_vmap_area_root)
+ if (augment)
rb_erase_augmented(&va->rb_node,
root, &free_vmap_area_rb_augment_cb);
else
rb_erase(&va->rb_node, root);
- list_del(&va->list);
+ list_del_init(&va->list);
RB_CLEAR_NODE(&va->rb_node);
}
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ __unlink_va(va, root, false);
+}
+
+static __always_inline void
+unlink_va_augment(struct vmap_area *va, struct rb_root *root)
+{
+ __unlink_va(va, root, true);
+}
+
#if DEBUG_AUGMENT_PROPAGATE_CHECK
/*
* Gets called when remove the node and rotate.
@@ -1060,7 +1088,7 @@ insert_vmap_area_augment(struct vmap_area *va,
link = find_va_links(va, root, NULL, &parent);
if (link) {
- link_va(va, root, parent, link, head);
+ link_va_augment(va, root, parent, link, head);
augment_tree_propagate_from(va);
}
}
@@ -1077,8 +1105,8 @@ insert_vmap_area_augment(struct vmap_area *va,
* ongoing.
*/
static __always_inline struct vmap_area *
-merge_or_add_vmap_area(struct vmap_area *va,
- struct rb_root *root, struct list_head *head)
+__merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head, bool augment)
{
struct vmap_area *sibling;
struct list_head *next;
@@ -1140,7 +1168,7 @@ merge_or_add_vmap_area(struct vmap_area *va,
* "normalized" because of rotation operations.
*/
if (merged)
- unlink_va(va, root);
+ __unlink_va(va, root, augment);
sibling->va_end = va->va_end;
@@ -1155,16 +1183,23 @@ merge_or_add_vmap_area(struct vmap_area *va,
insert:
if (!merged)
- link_va(va, root, parent, link, head);
+ __link_va(va, root, parent, link, head, augment);
return va;
}
static __always_inline struct vmap_area *
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ return __merge_or_add_vmap_area(va, root, head, false);
+}
+
+static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
struct rb_root *root, struct list_head *head)
{
- va = merge_or_add_vmap_area(va, root, head);
+ va = __merge_or_add_vmap_area(va, root, head, true);
if (va)
augment_tree_propagate_from(va);
@@ -1198,15 +1233,15 @@ is_within_this_va(struct vmap_area *va, unsigned long size,
* overhead.
*/
static __always_inline struct vmap_area *
-find_vmap_lowest_match(unsigned long size, unsigned long align,
- unsigned long vstart, bool adjust_search_size)
+find_vmap_lowest_match(struct rb_root *root, unsigned long size,
+ unsigned long align, unsigned long vstart, bool adjust_search_size)
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;
/* Start from the root. */
- node = free_vmap_area_root.rb_node;
+ node = root->rb_node;
/* Adjust the search size for alignment overhead. */
length = adjust_search_size ? size + align - 1 : size;
@@ -1334,11 +1369,12 @@ classify_va_fit_type(struct vmap_area *va,
}
static __always_inline int
-adjust_va_to_fit_type(struct vmap_area *va,
- unsigned long nva_start_addr, unsigned long size,
- enum fit_type type)
+adjust_va_to_fit_type(struct rb_root *root, struct list_head *head,
+ struct vmap_area *va, unsigned long nva_start_addr,
+ unsigned long size)
{
struct vmap_area *lva = NULL;
+ enum fit_type type = classify_va_fit_type(va, nva_start_addr, size);
if (type == FL_FIT_TYPE) {
/*
@@ -1348,7 +1384,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
* V NVA V
* |---------------|
*/
- unlink_va(va, &free_vmap_area_root);
+ unlink_va_augment(va, root);
kmem_cache_free(vmap_area_cachep, va);
} else if (type == LE_FIT_TYPE) {
/*
@@ -1426,8 +1462,7 @@ adjust_va_to_fit_type(struct vmap_area *va,
augment_tree_propagate_from(va);
if (lva) /* type == NE_FIT_TYPE */
- insert_vmap_area_augment(lva, &va->rb_node,
- &free_vmap_area_root, &free_vmap_area_list);
+ insert_vmap_area_augment(lva, &va->rb_node, root, head);
}
return 0;
@@ -1438,13 +1473,13 @@ adjust_va_to_fit_type(struct vmap_area *va,
* Otherwise a vend is returned that indicates failure.
*/
static __always_inline unsigned long
-__alloc_vmap_area(unsigned long size, unsigned long align,
+__alloc_vmap_area(struct rb_root *root, struct list_head *head,
+ unsigned long size, unsigned long align,
unsigned long vstart, unsigned long vend)
{
bool adjust_search_size = true;
unsigned long nva_start_addr;
struct vmap_area *va;
- enum fit_type type;
int ret;
/*
@@ -1459,7 +1494,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
if (align <= PAGE_SIZE || (align > PAGE_SIZE && (vend - vstart) == size))
adjust_search_size = false;
- va = find_vmap_lowest_match(size, align, vstart, adjust_search_size);
+ va = find_vmap_lowest_match(root, size, align, vstart, adjust_search_size);
if (unlikely(!va))
return vend;
@@ -1472,14 +1507,9 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
if (nva_start_addr + size > vend)
return vend;
- /* Classify what we have found. */
- type = classify_va_fit_type(va, nva_start_addr, size);
- if (WARN_ON_ONCE(type == NOTHING_FIT))
- return vend;
-
/* Update the free vmap_area. */
- ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
- if (ret)
+ ret = adjust_va_to_fit_type(root, head, va, nva_start_addr, size);
+ if (WARN_ON_ONCE(ret))
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
@@ -1569,7 +1599,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
retry:
preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
- addr = __alloc_vmap_area(size, align, vstart, vend);
+ addr = __alloc_vmap_area(&free_vmap_area_root, &free_vmap_area_list,
+ size, align, vstart, vend);
spin_unlock(&free_vmap_area_lock);
/*
@@ -1663,7 +1694,7 @@ static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
* Serialize vmap purging. There is no actual critical section protected
- * by this look, but we want to avoid concurrent calls for performance
+ * by this lock, but we want to avoid concurrent calls for performance
* reasons and to make the pcpu_get_vm_areas more deterministic.
*/
static DEFINE_MUTEX(vmap_purge_lock);
@@ -1677,32 +1708,32 @@ static void purge_fragmented_blocks_allcpus(void);
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
- struct list_head local_pure_list;
+ struct list_head local_purge_list;
struct vmap_area *va, *n_va;
lockdep_assert_held(&vmap_purge_lock);
spin_lock(&purge_vmap_area_lock);
purge_vmap_area_root = RB_ROOT;
- list_replace_init(&purge_vmap_area_list, &local_pure_list);
+ list_replace_init(&purge_vmap_area_list, &local_purge_list);
spin_unlock(&purge_vmap_area_lock);
- if (unlikely(list_empty(&local_pure_list)))
+ if (unlikely(list_empty(&local_purge_list)))
return false;
start = min(start,
- list_first_entry(&local_pure_list,
+ list_first_entry(&local_purge_list,
struct vmap_area, list)->va_start);
end = max(end,
- list_last_entry(&local_pure_list,
+ list_last_entry(&local_purge_list,
struct vmap_area, list)->va_end);
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
spin_lock(&free_vmap_area_lock);
- list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
+ list_for_each_entry_safe(va, n_va, &local_purge_list, list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
unsigned long orig_start = va->va_start;
unsigned long orig_end = va->va_end;
@@ -1803,7 +1834,7 @@ struct vmap_area *find_vmap_area(unsigned long addr)
struct vmap_area *va;
spin_lock(&vmap_area_lock);
- va = __find_vmap_area(addr);
+ va = __find_vmap_area(addr, &vmap_area_root);
spin_unlock(&vmap_area_lock);
return va;
@@ -2546,7 +2577,7 @@ struct vm_struct *remove_vm_area(const void *addr)
might_sleep();
spin_lock(&vmap_area_lock);
- va = __find_vmap_area((unsigned long)addr);
+ va = __find_vmap_area((unsigned long)addr, &vmap_area_root);
if (va && va->vm) {
struct vm_struct *vm = va->vm;
@@ -3168,15 +3199,15 @@ again:
/*
* Mark the pages as accessible, now that they are mapped.
- * The init condition should match the one in post_alloc_hook()
- * (except for the should_skip_init() check) to make sure that memory
- * is initialized under the same conditions regardless of the enabled
- * KASAN mode.
+ * The condition for setting KASAN_VMALLOC_INIT should complement the
+ * one in post_alloc_hook() with regards to the __GFP_SKIP_ZERO check
+ * to make sure that memory is initialized under the same conditions.
* Tag-based KASAN modes only assign tags to normal non-executable
* allocations, see __kasan_unpoison_vmalloc().
*/
kasan_flags |= KASAN_VMALLOC_VM_ALLOC;
- if (!want_init_on_free() && want_init_on_alloc(gfp_mask))
+ if (!want_init_on_free() && want_init_on_alloc(gfp_mask) &&
+ (gfp_mask & __GFP_SKIP_ZERO))
kasan_flags |= KASAN_VMALLOC_INIT;
/* KASAN_VMALLOC_PROT_NORMAL already set if required. */
area->addr = kasan_unpoison_vmalloc(area->addr, real_size, kasan_flags);
@@ -3735,7 +3766,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
int area, area2, last_area, term_area;
unsigned long base, start, size, end, last_end, orig_start, orig_end;
bool purged = false;
- enum fit_type type;
/* verify parameters and allocate data structures */
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -3846,15 +3876,13 @@ retry:
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- type = classify_va_fit_type(va, start, size);
- if (WARN_ON_ONCE(type == NOTHING_FIT))
+ ret = adjust_va_to_fit_type(&free_vmap_area_root,
+ &free_vmap_area_list,
+ va, start, size);
+ if (WARN_ON_ONCE(unlikely(ret)))
/* It is a BUG(), but trigger recovery instead. */
goto recovery;
- ret = adjust_va_to_fit_type(va, start, size, type);
- if (unlikely(ret))
- goto recovery;
-
/* Allocated area. */
va = vas[area];
va->va_start = start;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7d9a683e3a7..b2b1431352dc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -26,8 +26,7 @@
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
-#include <linux/buffer_head.h> /* for try_to_release_page(),
- buffer_heads_over_limit */
+#include <linux/buffer_head.h> /* for buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
@@ -102,6 +101,9 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
+ /* Proactive reclaim invoked by userspace through memory.reclaim */
+ unsigned int proactive:1;
+
/*
* Cgroup memory below memory.low is protected as long as we
* don't threaten to OOM. If any cgroup is reclaimed at
@@ -160,17 +162,17 @@ struct scan_control {
};
#ifdef ARCH_HAS_PREFETCHW
-#define prefetchw_prev_lru_page(_page, _base, _field) \
+#define prefetchw_prev_lru_folio(_folio, _base, _field) \
do { \
- if ((_page)->lru.prev != _base) { \
- struct page *prev; \
+ if ((_folio)->lru.prev != _base) { \
+ struct folio *prev; \
\
- prev = lru_to_page(&(_page->lru)); \
+ prev = lru_to_folio(&(_folio->lru)); \
prefetchw(&prev->_field); \
} \
} while (0)
#else
-#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
+#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif
/*
@@ -190,8 +192,8 @@ static void set_task_reclaim_state(struct task_struct *task,
task->reclaim_state = rs;
}
-static LIST_HEAD(shrinker_list);
-static DECLARE_RWSEM(shrinker_rwsem);
+LIST_HEAD(shrinker_list);
+DECLARE_RWSEM(shrinker_rwsem);
#ifdef CONFIG_MEMCG
static int shrinker_nr_max;
@@ -608,7 +610,7 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
/*
* Add a shrinker callback to be called from the vm.
*/
-int prealloc_shrinker(struct shrinker *shrinker)
+static int __prealloc_shrinker(struct shrinker *shrinker)
{
unsigned int size;
int err;
@@ -632,8 +634,39 @@ int prealloc_shrinker(struct shrinker *shrinker)
return 0;
}
+#ifdef CONFIG_SHRINKER_DEBUG
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ va_list ap;
+ int err;
+
+ va_start(ap, fmt);
+ shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!shrinker->name)
+ return -ENOMEM;
+
+ err = __prealloc_shrinker(shrinker);
+ if (err) {
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+ }
+
+ return err;
+}
+#else
+int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ return __prealloc_shrinker(shrinker);
+}
+#endif
+
void free_prealloced_shrinker(struct shrinker *shrinker)
{
+#ifdef CONFIG_SHRINKER_DEBUG
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+#endif
if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
down_write(&shrinker_rwsem);
unregister_memcg_shrinker(shrinker);
@@ -650,18 +683,45 @@ void register_shrinker_prepared(struct shrinker *shrinker)
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED;
+ shrinker_debugfs_add(shrinker);
up_write(&shrinker_rwsem);
}
-int register_shrinker(struct shrinker *shrinker)
+static int __register_shrinker(struct shrinker *shrinker)
{
- int err = prealloc_shrinker(shrinker);
+ int err = __prealloc_shrinker(shrinker);
if (err)
return err;
register_shrinker_prepared(shrinker);
return 0;
}
+
+#ifdef CONFIG_SHRINKER_DEBUG
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ va_list ap;
+ int err;
+
+ va_start(ap, fmt);
+ shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!shrinker->name)
+ return -ENOMEM;
+
+ err = __register_shrinker(shrinker);
+ if (err) {
+ kfree_const(shrinker->name);
+ shrinker->name = NULL;
+ }
+ return err;
+}
+#else
+int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+{
+ return __register_shrinker(shrinker);
+}
+#endif
EXPORT_SYMBOL(register_shrinker);
/*
@@ -677,6 +737,7 @@ void unregister_shrinker(struct shrinker *shrinker)
shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker);
+ shrinker_debugfs_remove(shrinker);
up_write(&shrinker_rwsem);
kfree(shrinker->nr_deferred);
@@ -1276,7 +1337,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
mem_cgroup_swapout(folio, swap);
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- __delete_from_swap_cache(&folio->page, swap, shadow);
+ __delete_from_swap_cache(folio, swap, shadow);
xa_unlock_irq(&mapping->i_pages);
put_swap_page(&folio->page, swap);
} else {
@@ -1519,7 +1580,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
* but that will never affect SWP_FS_OPS, so the data_race
* is safe.
*/
- return !data_race(page_swap_flags(&folio->page) & SWP_FS_OPS);
+ return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
}
/*
@@ -1926,7 +1987,7 @@ free_it:
* appear not as the counts should be low
*/
if (unlikely(folio_test_large(folio)))
- destroy_compound_page(&folio->page);
+ destroy_large_folio(folio);
else
list_add(&folio->lru, &free_pages);
continue;
@@ -1987,7 +2048,7 @@ keep:
}
unsigned int reclaim_clean_pages_from_list(struct zone *zone,
- struct list_head *page_list)
+ struct list_head *folio_list)
{
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
@@ -1995,16 +2056,16 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
};
struct reclaim_stat stat;
unsigned int nr_reclaimed;
- struct page *page, *next;
- LIST_HEAD(clean_pages);
+ struct folio *folio, *next;
+ LIST_HEAD(clean_folios);
unsigned int noreclaim_flag;
- list_for_each_entry_safe(page, next, page_list, lru) {
- if (!PageHuge(page) && page_is_file_lru(page) &&
- !PageDirty(page) && !__PageMovable(page) &&
- !PageUnevictable(page)) {
- ClearPageActive(page);
- list_move(&page->lru, &clean_pages);
+ list_for_each_entry_safe(folio, next, folio_list, lru) {
+ if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
+ !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
+ !folio_test_unevictable(folio)) {
+ folio_clear_active(folio);
+ list_move(&folio->lru, &clean_folios);
}
}
@@ -2015,11 +2076,11 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
* change in the future.
*/
noreclaim_flag = memalloc_noreclaim_save();
- nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+ nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc,
&stat, true);
memalloc_noreclaim_restore(noreclaim_flag);
- list_splice(&clean_pages, page_list);
+ list_splice(&clean_folios, folio_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)nr_reclaimed);
/*
@@ -2085,72 +2146,72 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
- LIST_HEAD(pages_skipped);
+ LIST_HEAD(folios_skipped);
total_scan = 0;
scan = 0;
while (scan < nr_to_scan && !list_empty(src)) {
struct list_head *move_to = src;
- struct page *page;
+ struct folio *folio;
- page = lru_to_page(src);
- prefetchw_prev_lru_page(page, src, flags);
+ folio = lru_to_folio(src);
+ prefetchw_prev_lru_folio(folio, src, flags);
- nr_pages = compound_nr(page);
+ nr_pages = folio_nr_pages(folio);
total_scan += nr_pages;
- if (page_zonenum(page) > sc->reclaim_idx) {
- nr_skipped[page_zonenum(page)] += nr_pages;
- move_to = &pages_skipped;
+ if (folio_zonenum(folio) > sc->reclaim_idx) {
+ nr_skipped[folio_zonenum(folio)] += nr_pages;
+ move_to = &folios_skipped;
goto move;
}
/*
- * Do not count skipped pages because that makes the function
- * return with no isolated pages if the LRU mostly contains
- * ineligible pages. This causes the VM to not reclaim any
- * pages, triggering a premature OOM.
- * Account all tail pages of THP.
+ * Do not count skipped folios because that makes the function
+ * return with no isolated folios if the LRU mostly contains
+ * ineligible folios. This causes the VM to not reclaim any
+ * folios, triggering a premature OOM.
+ * Account all pages in a folio.
*/
scan += nr_pages;
- if (!PageLRU(page))
+ if (!folio_test_lru(folio))
goto move;
- if (!sc->may_unmap && page_mapped(page))
+ if (!sc->may_unmap && folio_mapped(folio))
goto move;
/*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
+ * Be careful not to clear the lru flag until after we're
+ * sure the folio is not being freed elsewhere -- the
+ * folio release code relies on it.
*/
- if (unlikely(!get_page_unless_zero(page)))
+ if (unlikely(!folio_try_get(folio)))
goto move;
- if (!TestClearPageLRU(page)) {
- /* Another thread is already isolating this page */
- put_page(page);
+ if (!folio_test_clear_lru(folio)) {
+ /* Another thread is already isolating this folio */
+ folio_put(folio);
goto move;
}
nr_taken += nr_pages;
- nr_zone_taken[page_zonenum(page)] += nr_pages;
+ nr_zone_taken[folio_zonenum(folio)] += nr_pages;
move_to = dst;
move:
- list_move(&page->lru, move_to);
+ list_move(&folio->lru, move_to);
}
/*
- * Splice any skipped pages to the start of the LRU list. Note that
+ * Splice any skipped folios to the start of the LRU list. Note that
* this disrupts the LRU order when reclaiming for lower zones but
* we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
- * scanning would soon rescan the same pages to skip and waste lots
+ * scanning would soon rescan the same folios to skip and waste lots
* of cpu cycles.
*/
- if (!list_empty(&pages_skipped)) {
+ if (!list_empty(&folios_skipped)) {
int zid;
- list_splice(&pages_skipped, src);
+ list_splice(&folios_skipped, src);
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
if (!nr_skipped[zid])
continue;
@@ -2254,8 +2315,8 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
}
/*
- * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
- * On return, @list is reused as a list of pages to be freed by the caller.
+ * move_pages_to_lru() moves folios from private @list to appropriate LRU list.
+ * On return, @list is reused as a list of folios to be freed by the caller.
*
* Returns the number of pages moved to the given lruvec.
*/
@@ -2263,42 +2324,42 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
struct list_head *list)
{
int nr_pages, nr_moved = 0;
- LIST_HEAD(pages_to_free);
- struct page *page;
+ LIST_HEAD(folios_to_free);
while (!list_empty(list)) {
- page = lru_to_page(list);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- list_del(&page->lru);
- if (unlikely(!page_evictable(page))) {
+ struct folio *folio = lru_to_folio(list);
+
+ VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
+ list_del(&folio->lru);
+ if (unlikely(!folio_evictable(folio))) {
spin_unlock_irq(&lruvec->lru_lock);
- putback_lru_page(page);
+ folio_putback_lru(folio);
spin_lock_irq(&lruvec->lru_lock);
continue;
}
/*
- * The SetPageLRU needs to be kept here for list integrity.
+ * The folio_set_lru needs to be kept here for list integrity.
* Otherwise:
* #0 move_pages_to_lru #1 release_pages
- * if !put_page_testzero
- * if (put_page_testzero())
- * !PageLRU //skip lru_lock
- * SetPageLRU()
- * list_add(&page->lru,)
- * list_add(&page->lru,)
+ * if (!folio_put_testzero())
+ * if (folio_put_testzero())
+ * !lru //skip lru_lock
+ * folio_set_lru()
+ * list_add(&folio->lru,)
+ * list_add(&folio->lru,)
*/
- SetPageLRU(page);
+ folio_set_lru(folio);
- if (unlikely(put_page_testzero(page))) {
- __clear_page_lru_flags(page);
+ if (unlikely(folio_put_testzero(folio))) {
+ __folio_clear_lru_flags(folio);
- if (unlikely(PageCompound(page))) {
+ if (unlikely(folio_test_large(folio))) {
spin_unlock_irq(&lruvec->lru_lock);
- destroy_compound_page(page);
+ destroy_large_folio(folio);
spin_lock_irq(&lruvec->lru_lock);
} else
- list_add(&page->lru, &pages_to_free);
+ list_add(&folio->lru, &folios_to_free);
continue;
}
@@ -2307,18 +2368,18 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
* All pages were isolated from the same lruvec (and isolation
* inhibits memcg migration).
*/
- VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page);
- add_page_to_lru_list(page, lruvec);
- nr_pages = thp_nr_pages(page);
+ VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
+ lruvec_add_folio(lruvec, folio);
+ nr_pages = folio_nr_pages(folio);
nr_moved += nr_pages;
- if (PageActive(page))
+ if (folio_test_active(folio))
workingset_age_nonresident(lruvec, nr_pages);
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
- list_splice(&pages_to_free, list);
+ list_splice(&folios_to_free, list);
return nr_moved;
}
@@ -2429,21 +2490,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
}
/*
- * shrink_active_list() moves pages from the active LRU to the inactive LRU.
+ * shrink_active_list() moves folios from the active LRU to the inactive LRU.
*
- * We move them the other way if the page is referenced by one or more
+ * We move them the other way if the folio is referenced by one or more
* processes.
*
- * If the pages are mostly unmapped, the processing is fast and it is
+ * If the folios are mostly unmapped, the processing is fast and it is
* appropriate to hold lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (folio_referenced()), so
- * we should drop lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
+ * the folios are mapped, the processing is slow (folio_referenced()), so
+ * we should drop lru_lock around each folio. It's impossible to balance
+ * this, so instead we remove the folios from the LRU while processing them.
+ * It is safe to rely on the active flag against the non-LRU folios in here
+ * because nobody will play with that bit on a non-LRU folio.
*
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
+ * The downside is that we have to touch folio->_refcount against each folio.
+ * But we had to alter folio->flags anyway.
*/
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
@@ -2453,7 +2514,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
unsigned long nr_taken;
unsigned long nr_scanned;
unsigned long vm_flags;
- LIST_HEAD(l_hold); /* The pages which were snipped off */
+ LIST_HEAD(l_hold); /* The folios which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
unsigned nr_deactivate, nr_activate;
@@ -2478,23 +2539,21 @@ static void shrink_active_list(unsigned long nr_to_scan,
while (!list_empty(&l_hold)) {
struct folio *folio;
- struct page *page;
cond_resched();
folio = lru_to_folio(&l_hold);
list_del(&folio->lru);
- page = &folio->page;
- if (unlikely(!page_evictable(page))) {
- putback_lru_page(page);
+ if (unlikely(!folio_evictable(folio))) {
+ folio_putback_lru(folio);
continue;
}
if (unlikely(buffer_heads_over_limit)) {
- if (page_has_private(page) && trylock_page(page)) {
- if (page_has_private(page))
- try_to_release_page(page, 0);
- unlock_page(page);
+ if (folio_get_private(folio) && folio_trylock(folio)) {
+ if (folio_get_private(folio))
+ filemap_release_folio(folio, 0);
+ folio_unlock(folio);
}
}
@@ -2502,34 +2561,34 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (folio_referenced(folio, 0, sc->target_mem_cgroup,
&vm_flags) != 0) {
/*
- * Identify referenced, file-backed active pages and
+ * Identify referenced, file-backed active folios and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
- * memory under moderate memory pressure. Anon pages
+ * memory under moderate memory pressure. Anon folios
* are not likely to be evicted by use-once streaming
- * IO, plus JVM can create lots of anon VM_EXEC pages,
+ * IO, plus JVM can create lots of anon VM_EXEC folios,
* so we ignore them here.
*/
- if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
- nr_rotated += thp_nr_pages(page);
- list_add(&page->lru, &l_active);
+ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
+ nr_rotated += folio_nr_pages(folio);
+ list_add(&folio->lru, &l_active);
continue;
}
}
- ClearPageActive(page); /* we are de-activating */
- SetPageWorkingset(page);
- list_add(&page->lru, &l_inactive);
+ folio_clear_active(folio); /* we are de-activating */
+ folio_set_workingset(folio);
+ list_add(&folio->lru, &l_inactive);
}
/*
- * Move pages back to the lru list.
+ * Move folios back to the lru list.
*/
spin_lock_irq(&lruvec->lru_lock);
nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
- /* Keep all free pages in l_active list */
+ /* Keep all free folios in l_active list */
list_splice(&l_inactive, &l_active);
__count_vm_events(PGDEACTIVATE, nr_deactivate);
@@ -2568,34 +2627,33 @@ static unsigned int reclaim_page_list(struct list_head *page_list,
return nr_reclaimed;
}
-unsigned long reclaim_pages(struct list_head *page_list)
+unsigned long reclaim_pages(struct list_head *folio_list)
{
int nid;
unsigned int nr_reclaimed = 0;
- LIST_HEAD(node_page_list);
- struct page *page;
+ LIST_HEAD(node_folio_list);
unsigned int noreclaim_flag;
- if (list_empty(page_list))
+ if (list_empty(folio_list))
return nr_reclaimed;
noreclaim_flag = memalloc_noreclaim_save();
- nid = page_to_nid(lru_to_page(page_list));
+ nid = folio_nid(lru_to_folio(folio_list));
do {
- page = lru_to_page(page_list);
+ struct folio *folio = lru_to_folio(folio_list);
- if (nid == page_to_nid(page)) {
- ClearPageActive(page);
- list_move(&page->lru, &node_page_list);
+ if (nid == folio_nid(folio)) {
+ folio_clear_active(folio);
+ list_move(&folio->lru, &node_folio_list);
continue;
}
- nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid));
- nid = page_to_nid(lru_to_page(page_list));
- } while (!list_empty(page_list));
+ nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
+ nid = folio_nid(lru_to_folio(folio_list));
+ } while (!list_empty(folio_list));
- nr_reclaimed += reclaim_page_list(&node_page_list, NODE_DATA(nid));
+ nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid));
memalloc_noreclaim_restore(noreclaim_flag);
@@ -3125,9 +3183,10 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
sc->priority);
/* Record the group's reclaim efficiency */
- vmpressure(sc->gfp_mask, memcg, false,
- sc->nr_scanned - scanned,
- sc->nr_reclaimed - reclaimed);
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, memcg, false,
+ sc->nr_scanned - scanned,
+ sc->nr_reclaimed - reclaimed);
} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
}
@@ -3250,9 +3309,10 @@ again:
}
/* Record the subtree's reclaim efficiency */
- vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
- sc->nr_scanned - nr_scanned,
- sc->nr_reclaimed - nr_reclaimed);
+ if (!sc->proactive)
+ vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+ sc->nr_scanned - nr_scanned,
+ sc->nr_reclaimed - nr_reclaimed);
if (sc->nr_reclaimed - nr_reclaimed)
reclaimable = true;
@@ -3534,8 +3594,9 @@ retry:
__count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do {
- vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
- sc->priority);
+ if (!sc->proactive)
+ vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
+ sc->priority);
sc->nr_scanned = 0;
shrink_zones(zonelist, sc);
@@ -3825,7 +3886,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- bool may_swap)
+ unsigned int reclaim_options)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -3838,7 +3899,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.priority = DEF_PRIORITY,
.may_writepage = !laptop_mode,
.may_unmap = 1,
- .may_swap = may_swap,
+ .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
+ .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put
@@ -4595,7 +4657,7 @@ void kswapd_run(int nid)
/*
* Called by memory hotplug when all memory in a node is offlined. Caller must
- * hold mem_hotplug_begin/end().
+ * be holding mem_hotplug_begin/done().
*/
void kswapd_stop(int nid)
{
@@ -4790,45 +4852,57 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
}
#endif
+void check_move_unevictable_pages(struct pagevec *pvec)
+{
+ struct folio_batch fbatch;
+ unsigned i;
+
+ folio_batch_init(&fbatch);
+ for (i = 0; i < pvec->nr; i++) {
+ struct page *page = pvec->pages[i];
+
+ if (PageTransTail(page))
+ continue;
+ folio_batch_add(&fbatch, page_folio(page));
+ }
+ check_move_unevictable_folios(&fbatch);
+}
+EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+
/**
- * check_move_unevictable_pages - check pages for evictability and move to
- * appropriate zone lru list
- * @pvec: pagevec with lru pages to check
+ * check_move_unevictable_folios - Move evictable folios to appropriate zone
+ * lru list
+ * @fbatch: Batch of lru folios to check.
*
- * Checks pages for evictability, if an evictable page is in the unevictable
+ * Checks folios for evictability, if an evictable folio is in the unevictable
* lru list, moves it to the appropriate evictable lru list. This function
- * should be only used for lru pages.
+ * should be only used for lru folios.
*/
-void check_move_unevictable_pages(struct pagevec *pvec)
+void check_move_unevictable_folios(struct folio_batch *fbatch)
{
struct lruvec *lruvec = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;
- for (i = 0; i < pvec->nr; i++) {
- struct page *page = pvec->pages[i];
- struct folio *folio = page_folio(page);
- int nr_pages;
-
- if (PageTransTail(page))
- continue;
+ for (i = 0; i < fbatch->nr; i++) {
+ struct folio *folio = fbatch->folios[i];
+ int nr_pages = folio_nr_pages(folio);
- nr_pages = thp_nr_pages(page);
pgscanned += nr_pages;
- /* block memcg migration during page moving between lru */
- if (!TestClearPageLRU(page))
+ /* block memcg migration while the folio moves between lrus */
+ if (!folio_test_clear_lru(folio))
continue;
lruvec = folio_lruvec_relock_irq(folio, lruvec);
- if (page_evictable(page) && PageUnevictable(page)) {
- del_page_from_lru_list(page, lruvec);
- ClearPageUnevictable(page);
- add_page_to_lru_list(page, lruvec);
+ if (folio_evictable(folio) && folio_test_unevictable(folio)) {
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_unevictable(folio);
+ lruvec_add_folio(lruvec, folio);
pgrescued += nr_pages;
}
- SetPageLRU(page);
+ folio_set_lru(folio);
}
if (lruvec) {
@@ -4839,4 +4913,4 @@ void check_move_unevictable_pages(struct pagevec *pvec)
count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
}
-EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
+EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
diff --git a/mm/workingset.c b/mm/workingset.c
index 592569a8974c..a5e84862fc86 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -625,7 +625,7 @@ static int __init workingset_init(void)
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
timestamp_bits, max_order, bucket_order);
- ret = prealloc_shrinker(&workingset_shadow_shrinker);
+ ret = prealloc_shrinker(&workingset_shadow_shrinker, "mm-shadow");
if (ret)
goto err;
ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key,
diff --git a/mm/z3fold.c b/mm/z3fold.c
index f41f8b0d9e9a..cf71da10d04e 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -34,15 +34,11 @@
#include <linux/node.h>
#include <linux/compaction.h>
#include <linux/percpu.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
-#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
-#include <linux/magic.h>
#include <linux/kmemleak.h>
/*
@@ -149,7 +145,6 @@ struct z3fold_header {
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
- * @inode: inode for z3fold pseudo filesystem
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -169,7 +164,6 @@ struct z3fold_pool {
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct work_struct work;
- struct inode *inode;
};
/*
@@ -334,54 +328,6 @@ static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
}
}
-static int z3fold_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type z3fold_fs = {
- .name = "z3fold",
- .init_fs_context = z3fold_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
-static struct vfsmount *z3fold_mnt;
-static int __init z3fold_mount(void)
-{
- int ret = 0;
-
- z3fold_mnt = kern_mount(&z3fold_fs);
- if (IS_ERR(z3fold_mnt))
- ret = PTR_ERR(z3fold_mnt);
-
- return ret;
-}
-
-static void z3fold_unmount(void)
-{
- kern_unmount(z3fold_mnt);
-}
-
-static const struct address_space_operations z3fold_aops;
-static int z3fold_register_migration(struct z3fold_pool *pool)
-{
- pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
- if (IS_ERR(pool->inode)) {
- pool->inode = NULL;
- return 1;
- }
-
- pool->inode->i_mapping->private_data = pool;
- pool->inode->i_mapping->a_ops = &z3fold_aops;
- return 0;
-}
-
-static void z3fold_unregister_migration(struct z3fold_pool *pool)
-{
- if (pool->inode)
- iput(pool->inode);
-}
-
/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
struct z3fold_pool *pool, gfp_t gfp)
@@ -1002,14 +948,10 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
pool->release_wq = create_singlethread_workqueue(pool->name);
if (!pool->release_wq)
goto out_wq;
- if (z3fold_register_migration(pool))
- goto out_rwq;
INIT_WORK(&pool->work, free_pages_work);
pool->ops = ops;
return pool;
-out_rwq:
- destroy_workqueue(pool->release_wq);
out_wq:
destroy_workqueue(pool->compact_wq);
out_unbuddied:
@@ -1043,11 +985,12 @@ static void z3fold_destroy_pool(struct z3fold_pool *pool)
destroy_workqueue(pool->compact_wq);
destroy_workqueue(pool->release_wq);
- z3fold_unregister_migration(pool);
free_percpu(pool->unbuddied);
kfree(pool);
}
+static const struct movable_operations z3fold_mops;
+
/**
* z3fold_alloc() - allocates a region of a given size
* @pool: z3fold pool from which to allocate
@@ -1117,11 +1060,11 @@ retry:
}
if (can_sleep) {
lock_page(page);
- __SetPageMovable(page, pool->inode->i_mapping);
+ __SetPageMovable(page, &z3fold_mops);
unlock_page(page);
} else {
WARN_ON(!trylock_page(page));
- __SetPageMovable(page, pool->inode->i_mapping);
+ __SetPageMovable(page, &z3fold_mops);
unlock_page(page);
}
z3fold_page_lock(zhdr);
@@ -1554,12 +1497,11 @@ out:
return false;
}
-static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
+static int z3fold_page_migrate(struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
struct z3fold_header *zhdr, *new_zhdr;
struct z3fold_pool *pool;
- struct address_space *new_mapping;
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
@@ -1592,7 +1534,6 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
* so we only have to reinitialize it.
*/
INIT_LIST_HEAD(&new_zhdr->buddy);
- new_mapping = page_mapping(page);
__ClearPageMovable(page);
get_page(newpage);
@@ -1608,7 +1549,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
spin_lock(&pool->lock);
list_add(&newpage->lru, &pool->lru);
spin_unlock(&pool->lock);
- __SetPageMovable(newpage, new_mapping);
+ __SetPageMovable(newpage, &z3fold_mops);
z3fold_page_unlock(new_zhdr);
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
@@ -1642,9 +1583,9 @@ static void z3fold_page_putback(struct page *page)
z3fold_page_unlock(zhdr);
}
-static const struct address_space_operations z3fold_aops = {
+static const struct movable_operations z3fold_mops = {
.isolate_page = z3fold_page_isolate,
- .migratepage = z3fold_page_migrate,
+ .migrate_page = z3fold_page_migrate,
.putback_page = z3fold_page_putback,
};
@@ -1746,17 +1687,11 @@ MODULE_ALIAS("zpool-z3fold");
static int __init init_z3fold(void)
{
- int ret;
-
/*
* Make sure the z3fold header is not larger than the page size and
* there has remaining spaces for its buddy.
*/
BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
- ret = z3fold_mount();
- if (ret)
- return ret;
-
zpool_register_driver(&z3fold_zpool_driver);
return 0;
@@ -1764,7 +1699,6 @@ static int __init init_z3fold(void)
static void __exit exit_z3fold(void)
{
- z3fold_unmount();
zpool_unregister_driver(&z3fold_zpool_driver);
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 5d5fc04385b8..34f784a1604b 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -41,7 +41,6 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/magic.h>
#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/highmem.h>
@@ -59,8 +58,6 @@
#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
-#include <linux/mount.h>
-#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
#include <linux/wait.h>
#include <linux/pagemap.h>
@@ -177,10 +174,6 @@ struct zs_size_stat {
static struct dentry *zs_stat_root;
#endif
-#ifdef CONFIG_COMPACTION
-static struct vfsmount *zsmalloc_mnt;
-#endif
-
/*
* We assign a page to ZS_ALMOST_EMPTY fullness group when:
* n <= N / f, where
@@ -252,7 +245,6 @@ struct zs_pool {
struct dentry *stat_dentry;
#endif
#ifdef CONFIG_COMPACTION
- struct inode *inode;
struct work_struct free_work;
#endif
/* protect page/zspage migration */
@@ -271,6 +263,7 @@ struct zspage {
unsigned int freeobj;
struct page *first_page;
struct list_head list; /* fullness list */
+ struct zs_pool *pool;
#ifdef CONFIG_COMPACTION
rwlock_t lock;
#endif
@@ -295,8 +288,6 @@ static bool ZsHugePage(struct zspage *zspage)
}
#ifdef CONFIG_COMPACTION
-static int zs_register_migration(struct zs_pool *pool);
-static void zs_unregister_migration(struct zs_pool *pool);
static void migrate_lock_init(struct zspage *zspage);
static void migrate_read_lock(struct zspage *zspage);
static void migrate_read_unlock(struct zspage *zspage);
@@ -307,10 +298,6 @@ static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage);
#else
-static int zsmalloc_mount(void) { return 0; }
-static void zsmalloc_unmount(void) {}
-static int zs_register_migration(struct zs_pool *pool) { return 0; }
-static void zs_unregister_migration(struct zs_pool *pool) {}
static void migrate_lock_init(struct zspage *zspage) {}
static void migrate_read_lock(struct zspage *zspage) {}
static void migrate_read_unlock(struct zspage *zspage) {}
@@ -399,7 +386,10 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
*handle = zs_malloc(pool, size, gfp);
- return *handle ? 0 : -1;
+
+ if (IS_ERR((void *)(*handle)))
+ return PTR_ERR((void *)*handle);
+ return 0;
}
static void zs_zpool_free(void *pool, unsigned long handle)
{
@@ -1083,6 +1073,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
create_page_chain(class, zspage, pages);
init_zspage(class, zspage);
+ zspage->pool = pool;
return zspage;
}
@@ -1400,7 +1391,7 @@ static unsigned long obj_malloc(struct zs_pool *pool,
* @gfp: gfp flags when allocating object
*
* On success, handle to the allocated object is returned,
- * otherwise 0.
+ * otherwise an ERR_PTR().
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
*/
unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
@@ -1411,11 +1402,11 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
struct zspage *zspage;
if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
- return 0;
+ return (unsigned long)ERR_PTR(-EINVAL);
handle = cache_alloc_handle(pool, gfp);
if (!handle)
- return 0;
+ return (unsigned long)ERR_PTR(-ENOMEM);
/* extra space in chunk to keep the handle */
size += ZS_HANDLE_SIZE;
@@ -1440,7 +1431,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
zspage = alloc_zspage(pool, class, gfp);
if (!zspage) {
cache_free_handle(pool, handle);
- return 0;
+ return (unsigned long)ERR_PTR(-ENOMEM);
}
spin_lock(&class->lock);
@@ -1754,33 +1745,6 @@ static void lock_zspage(struct zspage *zspage)
migrate_read_unlock(zspage);
}
-static int zs_init_fs_context(struct fs_context *fc)
-{
- return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
-}
-
-static struct file_system_type zsmalloc_fs = {
- .name = "zsmalloc",
- .init_fs_context = zs_init_fs_context,
- .kill_sb = kill_anon_super,
-};
-
-static int zsmalloc_mount(void)
-{
- int ret = 0;
-
- zsmalloc_mnt = kern_mount(&zsmalloc_fs);
- if (IS_ERR(zsmalloc_mnt))
- ret = PTR_ERR(zsmalloc_mnt);
-
- return ret;
-}
-
-static void zsmalloc_unmount(void)
-{
- kern_unmount(zsmalloc_mnt);
-}
-
static void migrate_lock_init(struct zspage *zspage)
{
rwlock_init(&zspage->lock);
@@ -1823,6 +1787,8 @@ static void dec_zspage_isolation(struct zspage *zspage)
zspage->isolated--;
}
+static const struct movable_operations zsmalloc_mops;
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1843,7 +1809,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
set_first_obj_offset(newpage, get_first_obj_offset(oldpage));
if (unlikely(ZsHugePage(zspage)))
newpage->index = oldpage->index;
- __SetPageMovable(newpage, page_mapping(oldpage));
+ __SetPageMovable(newpage, &zsmalloc_mops);
}
static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
@@ -1865,8 +1831,8 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
return true;
}
-static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
- struct page *page, enum migrate_mode mode)
+static int zs_page_migrate(struct page *newpage, struct page *page,
+ enum migrate_mode mode)
{
struct zs_pool *pool;
struct size_class *class;
@@ -1889,14 +1855,15 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage,
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
- pool = mapping->private_data;
+ /* The page is locked, so this pointer must remain valid */
+ zspage = get_zspage(page);
+ pool = zspage->pool;
/*
* The pool migrate_lock protects the race between zpage migration
* and zs_free.
*/
write_lock(&pool->migrate_lock);
- zspage = get_zspage(page);
class = zspage_class(pool, zspage);
/*
@@ -1964,31 +1931,12 @@ static void zs_page_putback(struct page *page)
migrate_write_unlock(zspage);
}
-static const struct address_space_operations zsmalloc_aops = {
+static const struct movable_operations zsmalloc_mops = {
.isolate_page = zs_page_isolate,
- .migratepage = zs_page_migrate,
+ .migrate_page = zs_page_migrate,
.putback_page = zs_page_putback,
};
-static int zs_register_migration(struct zs_pool *pool)
-{
- pool->inode = alloc_anon_inode(zsmalloc_mnt->mnt_sb);
- if (IS_ERR(pool->inode)) {
- pool->inode = NULL;
- return 1;
- }
-
- pool->inode->i_mapping->private_data = pool;
- pool->inode->i_mapping->a_ops = &zsmalloc_aops;
- return 0;
-}
-
-static void zs_unregister_migration(struct zs_pool *pool)
-{
- flush_work(&pool->free_work);
- iput(pool->inode);
-}
-
/*
* Caller should hold page_lock of all pages in the zspage
* In here, we cannot use zspage meta data.
@@ -2032,6 +1980,11 @@ static void kick_deferred_free(struct zs_pool *pool)
schedule_work(&pool->free_work);
}
+static void zs_flush_migration(struct zs_pool *pool)
+{
+ flush_work(&pool->free_work);
+}
+
static void init_deferred_free(struct zs_pool *pool)
{
INIT_WORK(&pool->free_work, async_free_zspage);
@@ -2043,10 +1996,12 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage)
do {
WARN_ON(!trylock_page(page));
- __SetPageMovable(page, pool->inode->i_mapping);
+ __SetPageMovable(page, &zsmalloc_mops);
unlock_page(page);
} while ((page = get_next_page(page)) != NULL);
}
+#else
+static inline void zs_flush_migration(struct zs_pool *pool) { }
#endif
/*
@@ -2217,7 +2172,8 @@ static int zs_register_shrinker(struct zs_pool *pool)
pool->shrinker.batch = 0;
pool->shrinker.seeks = DEFAULT_SEEKS;
- return register_shrinker(&pool->shrinker);
+ return register_shrinker(&pool->shrinker, "mm-zspool:%s",
+ pool->name);
}
/**
@@ -2324,9 +2280,6 @@ struct zs_pool *zs_create_pool(const char *name)
/* debug only, don't abort if it fails */
zs_pool_stat_create(pool, name);
- if (zs_register_migration(pool))
- goto err;
-
/*
* Not critical since shrinker is only used to trigger internal
* defragmentation of the pool which is pretty optional thing. If
@@ -2348,7 +2301,7 @@ void zs_destroy_pool(struct zs_pool *pool)
int i;
zs_unregister_shrinker(pool);
- zs_unregister_migration(pool);
+ zs_flush_migration(pool);
zs_pool_stat_destroy(pool);
for (i = 0; i < ZS_SIZE_CLASSES; i++) {
@@ -2380,14 +2333,10 @@ static int __init zs_init(void)
{
int ret;
- ret = zsmalloc_mount();
- if (ret)
- goto out;
-
ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare",
zs_cpu_prepare, zs_cpu_dead);
if (ret)
- goto hp_setup_fail;
+ goto out;
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
@@ -2397,8 +2346,6 @@ static int __init zs_init(void)
return 0;
-hp_setup_fail:
- zsmalloc_unmount();
out:
return ret;
}
@@ -2408,7 +2355,6 @@ static void __exit zs_exit(void)
#ifdef CONFIG_ZPOOL
zpool_unregister_driver(&zs_zpool_driver);
#endif
- zsmalloc_unmount();
cpuhp_remove_state(CPUHP_MM_ZS_PREPARE);
zs_stat_exit();