diff options
author | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2023-02-21 11:19:49 -0800 |
---|---|---|
committer | Dmitry Torokhov <dmitry.torokhov@gmail.com> | 2023-02-21 11:19:49 -0800 |
commit | 7ae9fb1b7ecbb5d85d07857943f677fd1a559b18 (patch) | |
tree | dbdd35328f43569c38c4ce193cefd7d2b6b9fbfd /mm | |
parent | 9c445d2637c938a800fcc8b5f0b10e60c94460c7 (diff) | |
parent | 9e69e845ae95227949c400af1037dca023f73038 (diff) | |
download | linux-stable-7ae9fb1b7ecbb5d85d07857943f677fd1a559b18.tar.gz linux-stable-7ae9fb1b7ecbb5d85d07857943f677fd1a559b18.tar.bz2 linux-stable-7ae9fb1b7ecbb5d85d07857943f677fd1a559b18.zip |
Merge branch 'next' into for-linus
Prepare input updates for 6.3 merge window.
Diffstat (limited to 'mm')
101 files changed, 6046 insertions, 4305 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 57e1d8c5b505..ff7b209dec05 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -219,17 +219,43 @@ config SLUB and has enhanced diagnostics. SLUB is the default choice for a slab allocator. -config SLOB +config SLOB_DEPRECATED depends on EXPERT - bool "SLOB (Simple Allocator)" + bool "SLOB (Simple Allocator - DEPRECATED)" depends on !PREEMPT_RT help + Deprecated and scheduled for removal in a few cycles. SLUB + recommended as replacement. CONFIG_SLUB_TINY can be considered + on systems with 16MB or less RAM. + + If you need SLOB to stay, please contact linux-mm@kvack.org and + people listed in the SLAB ALLOCATOR section of MAINTAINERS file, + with your use case. + SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but does not perform as well on large systems. endchoice +config SLOB + bool + default y + depends on SLOB_DEPRECATED + +config SLUB_TINY + bool "Configure SLUB for minimal memory footprint" + depends on SLUB && EXPERT + select SLAB_MERGE_DEFAULT + help + Configures the SLUB allocator in a way to achieve minimal memory + footprint, sacrificing scalability, debugging and other features. + This is intended only for the smallest system that had used the + SLOB allocator and is not recommended for systems with more than + 16MB RAM. + + If unsure, say N. + config SLAB_MERGE_DEFAULT bool "Allow slab caches to be merged" default y @@ -247,7 +273,7 @@ config SLAB_MERGE_DEFAULT config SLAB_FREELIST_RANDOM bool "Randomize slab freelist" - depends on SLAB || SLUB + depends on SLAB || (SLUB && !SLUB_TINY) help Randomizes the freelist order used on creating new pages. 
This security feature reduces the predictability of the kernel slab @@ -255,7 +281,7 @@ config SLAB_FREELIST_RANDOM config SLAB_FREELIST_HARDENED bool "Harden slab freelist metadata" - depends on SLAB || SLUB + depends on SLAB || (SLUB && !SLUB_TINY) help Many kernel heap attacks try to target slab cache metadata and other infrastructure. This options makes minor performance @@ -267,7 +293,7 @@ config SLAB_FREELIST_HARDENED config SLUB_STATS default n bool "Enable SLUB performance statistics" - depends on SLUB && SYSFS + depends on SLUB && SYSFS && !SLUB_TINY help SLUB statistics are useful to debug SLUBs allocation behavior in order find ways to optimize the allocator. This should never be @@ -279,7 +305,7 @@ config SLUB_STATS config SLUB_CPU_PARTIAL default y - depends on SLUB && SMP + depends on SLUB && SMP && !SLUB_TINY bool "SLUB per cpu partial cache" help Per cpu partial caches accelerate objects allocation and freeing @@ -775,7 +801,7 @@ endchoice config THP_SWAP def_bool y - depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP + depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP && 64BIT help Swap transparent huge pages in one piece, without splitting. XXX: For now, swap cluster backing transparent huge page @@ -1005,6 +1031,14 @@ config ARCH_USES_HIGH_VMA_FLAGS config ARCH_HAS_PKEYS bool +config ARCH_USES_PG_ARCH_X + bool + help + Enable the definition of PG_arch_x page flags with x > 1. Only + suitable for 64-bit architectures with CONFIG_FLATMEM or + CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be + enough room for additional bits in page->flags. 
+ config VM_EVENT_COUNTERS default y bool "Enable VM event counters for /proc/vmstat" if EXPERT @@ -1044,7 +1078,7 @@ config GUP_TEST comment "GUP_TEST needs to have DEBUG_FS enabled" depends on !GUP_TEST && !DEBUG_FS -config GUP_GET_PTE_LOW_HIGH +config GUP_GET_PXX_LOW_HIGH bool config ARCH_HAS_PTE_SPECIAL @@ -1074,7 +1108,13 @@ config IO_MAPPING bool config SECRETMEM - def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + default y + bool "Enable memfd_secret() system call" if EXPERT + depends on ARCH_HAS_SET_DIRECT_MAP + help + Enable the memfd_secret() system call with the ability to create + memory areas visible only in the context of the owning process and + not mapped to other processes and other kernel page tables. config ANON_VMA_NAME bool "Anonymous VMA name support" @@ -1107,17 +1147,10 @@ config HAVE_ARCH_USERFAULTFD_MINOR help Arch has userfaultfd minor fault support -config PTE_MARKER - bool - - help - Allows to create marker PTEs for file-backed memory. - config PTE_MARKER_UFFD_WP bool "Userfaultfd write protection support for shmem/hugetlbfs" default y depends on HAVE_ARCH_USERFAULTFD_WP - select PTE_MARKER help Allows to create marker PTEs for userfaultfd write protection diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index ce8dded36de9..fca699ad1fb0 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -56,7 +56,7 @@ config DEBUG_SLAB config SLUB_DEBUG default y bool "Enable SLUB debugging support" if EXPERT - depends on SLUB && SYSFS + depends on SLUB && SYSFS && !SLUB_TINY select STACKDEPOT if STACKTRACE_SUPPORT help SLUB has extensive debug support features. 
Disabling these can diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c30419a5e119..a53b9360b72e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -178,7 +178,26 @@ static ssize_t min_ratio_store(struct device *dev, return ret; } -BDI_SHOW(min_ratio, bdi->min_ratio) +BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) + +static ssize_t min_ratio_fine_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_min_ratio_no_scale(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -197,7 +216,82 @@ static ssize_t max_ratio_store(struct device *dev, return ret; } -BDI_SHOW(max_ratio, bdi->max_ratio) +BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) + +static ssize_t max_ratio_fine_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_max_ratio_no_scale(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(max_ratio_fine, bdi->max_ratio) + +static ssize_t min_bytes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); +} + +static ssize_t min_bytes_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + u64 bytes; + ssize_t ret; + + ret = kstrtoull(buf, 10, &bytes); + if (ret < 0) + return ret; + + ret = bdi_set_min_bytes(bdi, 
bytes); + if (!ret) + ret = count; + + return ret; +} +DEVICE_ATTR_RW(min_bytes); + +static ssize_t max_bytes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); +} + +static ssize_t max_bytes_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + u64 bytes; + ssize_t ret; + + ret = kstrtoull(buf, 10, &bytes); + if (ret < 0) + return ret; + + ret = bdi_set_max_bytes(bdi, bytes); + if (!ret) + ret = count; + + return ret; +} +DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, @@ -209,11 +303,44 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strict_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int strict_limit; + ssize_t ret; + + ret = kstrtouint(buf, 10, &strict_limit); + if (ret < 0) + return ret; + + ret = bdi_set_strict_limit(bdi, strict_limit); + if (!ret) + ret = count; + + return ret; +} + +static ssize_t strict_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strict_limit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, + &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, + &dev_attr_max_ratio_fine.attr, + &dev_attr_min_bytes.attr, + &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); @@ -780,7 +907,7 @@ int bdi_init(struct 
backing_dev_info *bdi) kref_init(&bdi->refcnt); bdi->min_ratio = 0; - bdi->max_ratio = 100; + bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); diff --git a/mm/compaction.c b/mm/compaction.c index c51f7f545afe..ca1603524bbe 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -985,28 +985,28 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, } /* + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. + */ + if (unlikely(!get_page_unless_zero(page))) + goto isolate_fail; + + /* * Migration will fail if an anonymous page is pinned in memory, * so avoid taking lru_lock and isolating it unnecessarily in an * admittedly racy check. */ mapping = page_mapping(page); - if (!mapping && page_count(page) > page_mapcount(page)) - goto isolate_fail; + if (!mapping && (page_count(page) - 1) > total_mapcount(page)) + goto isolate_fail_put; /* * Only allow to migrate anonymous pages in GFP_NOFS context * because those do not depend on fs locks. */ if (!(cc->gfp_mask & __GFP_FS) && mapping) - goto isolate_fail; - - /* - * Be careful not to clear PageLRU until after we're - * sure the page is not being freed elsewhere -- the - * page release code relies on it. 
- */ - if (unlikely(!get_page_unless_zero(page))) - goto isolate_fail; + goto isolate_fail_put; /* Only take pages on LRU: a check now makes later tests safe */ if (!PageLRU(page)) @@ -1344,7 +1344,7 @@ move_freelist_tail(struct list_head *freelist, struct page *freepage) } static void -fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated) +fast_isolate_around(struct compact_control *cc, unsigned long pfn) { unsigned long start_pfn, end_pfn; struct page *page; @@ -1365,21 +1365,13 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long if (!page) return; - /* Scan before */ - if (start_pfn != pfn) { - isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false); - if (cc->nr_freepages >= cc->nr_migratepages) - return; - } - - /* Scan after */ - start_pfn = pfn + nr_isolated; - if (start_pfn < end_pfn) - isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); + isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ if (cc->nr_freepages < cc->nr_migratepages) set_pageblock_skip(page); + + return; } /* Search orders in round-robin fashion */ @@ -1556,7 +1548,7 @@ fast_isolate_freepages(struct compact_control *cc) return cc->free_pfn; low_pfn = page_to_pfn(page); - fast_isolate_around(cc, low_pfn, nr_isolated); + fast_isolate_around(cc, low_pfn); return low_pfn; } diff --git a/mm/damon/Makefile b/mm/damon/Makefile index 3e6b8ad73858..f7add3f4aa79 100644 --- a/mm/damon/Makefile +++ b/mm/damon/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_DAMON_VADDR) += ops-common.o vaddr.o obj-$(CONFIG_DAMON_PADDR) += ops-common.o paddr.o -obj-$(CONFIG_DAMON_SYSFS) += sysfs.o +obj-$(CONFIG_DAMON_SYSFS) += sysfs-common.o sysfs-schemes.o sysfs.o obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o -obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o -obj-$(CONFIG_DAMON_LRU_SORT) += lru_sort.o +obj-$(CONFIG_DAMON_RECLAIM) 
+= modules-common.o reclaim.o +obj-$(CONFIG_DAMON_LRU_SORT) += modules-common.o lru_sort.o diff --git a/mm/damon/core.c b/mm/damon/core.c index 36d098d06c55..ceec75b88ef9 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -694,6 +694,115 @@ static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, return c->ops.get_scheme_score(c, t, r, s) >= s->quota.min_score; } +/* + * damos_skip_charged_region() - Check if the given region or starting part of + * it is already charged for the DAMOS quota. + * @t: The target of the region. + * @rp: The pointer to the region. + * @s: The scheme to be applied. + * + * If a quota of a scheme has exceeded in a quota charge window, the scheme's + * action would applied to only a part of the target access pattern fulfilling + * regions. To avoid applying the scheme action to only already applied + * regions, DAMON skips applying the scheme action to the regions that charged + * in the previous charge window. + * + * This function checks if a given region should be skipped or not for the + * reason. If only the starting part of the region has previously charged, + * this function splits the region into two so that the second one covers the + * area that not charged in the previous charge widnow and saves the second + * region in *rp and returns false, so that the caller can apply DAMON action + * to the second one. + * + * Return: true if the region should be entirely skipped, false otherwise. 
+ */ +static bool damos_skip_charged_region(struct damon_target *t, + struct damon_region **rp, struct damos *s) +{ + struct damon_region *r = *rp; + struct damos_quota *quota = &s->quota; + unsigned long sz_to_skip; + + /* Skip previously charged regions */ + if (quota->charge_target_from) { + if (t != quota->charge_target_from) + return true; + if (r == damon_last_region(t)) { + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return true; + } + if (quota->charge_addr_from && + r->ar.end <= quota->charge_addr_from) + return true; + + if (quota->charge_addr_from && r->ar.start < + quota->charge_addr_from) { + sz_to_skip = ALIGN_DOWN(quota->charge_addr_from - + r->ar.start, DAMON_MIN_REGION); + if (!sz_to_skip) { + if (damon_sz_region(r) <= DAMON_MIN_REGION) + return true; + sz_to_skip = DAMON_MIN_REGION; + } + damon_split_region_at(t, r, sz_to_skip); + r = damon_next_region(r); + *rp = r; + } + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + } + return false; +} + +static void damos_update_stat(struct damos *s, + unsigned long sz_tried, unsigned long sz_applied) +{ + s->stat.nr_tried++; + s->stat.sz_tried += sz_tried; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; +} + +static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, + struct damon_region *r, struct damos *s) +{ + struct damos_quota *quota = &s->quota; + unsigned long sz = damon_sz_region(r); + struct timespec64 begin, end; + unsigned long sz_applied = 0; + int err = 0; + + if (c->ops.apply_scheme) { + if (quota->esz && quota->charged_sz + sz > quota->esz) { + sz = ALIGN_DOWN(quota->esz - quota->charged_sz, + DAMON_MIN_REGION); + if (!sz) + goto update_stat; + damon_split_region_at(t, r, sz); + } + ktime_get_coarse_ts64(&begin); + if (c->callback.before_damos_apply) + err = c->callback.before_damos_apply(c, t, r, s); + if (!err) + sz_applied = c->ops.apply_scheme(c, t, r, s); + ktime_get_coarse_ts64(&end); + 
quota->total_charged_ns += timespec64_to_ns(&end) - + timespec64_to_ns(&begin); + quota->charged_sz += sz; + if (quota->esz && quota->charged_sz >= quota->esz) { + quota->charge_target_from = t; + quota->charge_addr_from = r->ar.end + 1; + } + } + if (s->action != DAMOS_STAT) + r->age = 0; + +update_stat: + damos_update_stat(s, sz, sz_applied); +} + static void damon_do_apply_schemes(struct damon_ctx *c, struct damon_target *t, struct damon_region *r) @@ -702,9 +811,6 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = damon_sz_region(r); - struct timespec64 begin, end; - unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -713,70 +819,13 @@ static void damon_do_apply_schemes(struct damon_ctx *c, if (quota->esz && quota->charged_sz >= quota->esz) continue; - /* Skip previously charged regions */ - if (quota->charge_target_from) { - if (t != quota->charge_target_from) - continue; - if (r == damon_last_region(t)) { - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - continue; - } - if (quota->charge_addr_from && - r->ar.end <= quota->charge_addr_from) - continue; - - if (quota->charge_addr_from && r->ar.start < - quota->charge_addr_from) { - sz = ALIGN_DOWN(quota->charge_addr_from - - r->ar.start, DAMON_MIN_REGION); - if (!sz) { - if (damon_sz_region(r) <= - DAMON_MIN_REGION) - continue; - sz = DAMON_MIN_REGION; - } - damon_split_region_at(t, r, sz); - r = damon_next_region(r); - sz = damon_sz_region(r); - } - quota->charge_target_from = NULL; - quota->charge_addr_from = 0; - } + if (damos_skip_charged_region(t, &r, s)) + continue; if (!damos_valid_target(c, t, r, s)) continue; - /* Apply the scheme */ - if (c->ops.apply_scheme) { - if (quota->esz && - quota->charged_sz + sz > quota->esz) { - sz = ALIGN_DOWN(quota->esz - quota->charged_sz, - DAMON_MIN_REGION); - if (!sz) - goto update_stat; - damon_split_region_at(t, r, sz); - } - 
ktime_get_coarse_ts64(&begin); - sz_applied = c->ops.apply_scheme(c, t, r, s); - ktime_get_coarse_ts64(&end); - quota->total_charged_ns += timespec64_to_ns(&end) - - timespec64_to_ns(&begin); - quota->charged_sz += sz; - if (quota->esz && quota->charged_sz >= quota->esz) { - quota->charge_target_from = t; - quota->charge_addr_from = r->ar.end + 1; - } - } - if (s->action != DAMOS_STAT) - r->age = 0; - -update_stat: - s->stat.nr_tried++; - s->stat.sz_tried += sz; - if (sz_applied) - s->stat.nr_applied++; - s->stat.sz_applied += sz_applied; + damos_apply_scheme(c, t, r, s); } } @@ -803,59 +852,64 @@ static void damos_set_effective_quota(struct damos_quota *quota) quota->esz = esz; } -static void kdamond_apply_schemes(struct damon_ctx *c) +static void damos_adjust_quota(struct damon_ctx *c, struct damos *s) { + struct damos_quota *quota = &s->quota; struct damon_target *t; - struct damon_region *r, *next_r; - struct damos *s; + struct damon_region *r; + unsigned long cumulated_sz; + unsigned int score, max_score = 0; - damon_for_each_scheme(s, c) { - struct damos_quota *quota = &s->quota; - unsigned long cumulated_sz; - unsigned int score, max_score = 0; + if (!quota->ms && !quota->sz) + return; - if (!s->wmarks.activated) - continue; + /* New charge window starts */ + if (time_after_eq(jiffies, quota->charged_from + + msecs_to_jiffies(quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; + quota->total_charged_sz += quota->charged_sz; + quota->charged_from = jiffies; + quota->charged_sz = 0; + damos_set_effective_quota(quota); + } - if (!quota->ms && !quota->sz) - continue; + if (!c->ops.get_scheme_score) + return; - /* New charge window starts */ - if (time_after_eq(jiffies, quota->charged_from + - msecs_to_jiffies( - quota->reset_interval))) { - if (quota->esz && quota->charged_sz >= quota->esz) - s->stat.qt_exceeds++; - quota->total_charged_sz += quota->charged_sz; - quota->charged_from = jiffies; - 
quota->charged_sz = 0; - damos_set_effective_quota(quota); + /* Fill up the score histogram */ + memset(quota->histogram, 0, sizeof(quota->histogram)); + damon_for_each_target(t, c) { + damon_for_each_region(r, t) { + if (!__damos_valid_target(r, s)) + continue; + score = c->ops.get_scheme_score(c, t, r, s); + quota->histogram[score] += damon_sz_region(r); + if (score > max_score) + max_score = score; } + } - if (!c->ops.get_scheme_score) - continue; + /* Set the min score limit */ + for (cumulated_sz = 0, score = max_score; ; score--) { + cumulated_sz += quota->histogram[score]; + if (cumulated_sz >= quota->esz || !score) + break; + } + quota->min_score = score; +} - /* Fill up the score histogram */ - memset(quota->histogram, 0, sizeof(quota->histogram)); - damon_for_each_target(t, c) { - damon_for_each_region(r, t) { - if (!__damos_valid_target(r, s)) - continue; - score = c->ops.get_scheme_score( - c, t, r, s); - quota->histogram[score] += damon_sz_region(r); - if (score > max_score) - max_score = score; - } - } +static void kdamond_apply_schemes(struct damon_ctx *c) +{ + struct damon_target *t; + struct damon_region *r, *next_r; + struct damos *s; - /* Set the min score limit */ - for (cumulated_sz = 0, score = max_score; ; score--) { - cumulated_sz += quota->histogram[score]; - if (cumulated_sz >= quota->esz || !score) - break; - } - quota->min_score = score; + damon_for_each_scheme(s, c) { + if (!s->wmarks.activated) + continue; + + damos_adjust_quota(c, s); } damon_for_each_target(t, c) { diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 6f0ae7d3ae39..b3f454a5c682 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -890,6 +890,7 @@ out: static int dbgfs_rm_context(char *name) { struct dentry *root, *dir, **new_dirs; + struct inode *inode; struct damon_ctx **new_ctxs; int i, j; int ret = 0; @@ -905,6 +906,12 @@ static int dbgfs_rm_context(char *name) if (!dir) return -ENOENT; + inode = d_inode(dir); + if (!S_ISDIR(inode->i_mode)) { + ret = -EINVAL; 
+ goto out_dput; + } + new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs), GFP_KERNEL); if (!new_dirs) { diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index efbc2bda8b9c..7b8fce2f67a8 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -8,10 +8,8 @@ #define pr_fmt(fmt) "damon-lru-sort: " fmt #include <linux/damon.h> -#include <linux/ioport.h> +#include <linux/kstrtox.h> #include <linux/module.h> -#include <linux/sched.h> -#include <linux/workqueue.h> #include "modules-common.h" @@ -237,38 +235,31 @@ static int damon_lru_sort_turn(bool on) return 0; } -static struct delayed_work damon_lru_sort_timer; -static void damon_lru_sort_timer_fn(struct work_struct *work) -{ - static bool last_enabled; - bool now_enabled; - - now_enabled = enabled; - if (last_enabled != now_enabled) { - if (!damon_lru_sort_turn(now_enabled)) - last_enabled = now_enabled; - else - enabled = last_enabled; - } -} -static DECLARE_DELAYED_WORK(damon_lru_sort_timer, damon_lru_sort_timer_fn); - -static bool damon_lru_sort_initialized; - static int damon_lru_sort_enabled_store(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); + bool is_enabled = enabled; + bool enable; + int err; - if (rc < 0) - return rc; + err = kstrtobool(val, &enable); + if (err) + return err; - if (!damon_lru_sort_initialized) - return rc; + if (is_enabled == enable) + return 0; - schedule_delayed_work(&damon_lru_sort_timer, 0); + /* Called before init function. The function will handle this. 
*/ + if (!ctx) + goto set_param_out; - return 0; + err = damon_lru_sort_turn(enable); + if (err) + return err; + +set_param_out: + enabled = enable; + return err; } static const struct kernel_param_ops enabled_param_ops = { @@ -314,29 +305,19 @@ static int damon_lru_sort_after_wmarks_check(struct damon_ctx *c) static int __init damon_lru_sort_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_lru_sort_after_wmarks_check; ctx->callback.after_aggregation = damon_lru_sort_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - - schedule_delayed_work(&damon_lru_sort_timer, 0); + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_lru_sort_turn(true); - damon_lru_sort_initialized = true; - return 0; + return err; } module_init(damon_lru_sort_init); diff --git a/mm/damon/modules-common.c b/mm/damon/modules-common.c new file mode 100644 index 000000000000..b2381a8466ec --- /dev/null +++ b/mm/damon/modules-common.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park <sjpark@amazon.de> + */ + +#include <linux/damon.h> + +#include "modules-common.h" + +/* + * Allocate, set, and return a DAMON context for the physical address space. 
+ * @ctxp: Pointer to save the point to the newly created context + * @targetp: Pointer to save the point to the newly created target + */ +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp) +{ + struct damon_ctx *ctx; + struct damon_target *target; + + ctx = damon_new_ctx(); + if (!ctx) + return -ENOMEM; + + if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { + damon_destroy_ctx(ctx); + return -EINVAL; + } + + target = damon_new_target(); + if (!target) { + damon_destroy_ctx(ctx); + return -ENOMEM; + } + damon_add_target(ctx, target); + + *ctxp = ctx; + *targetp = target; + return 0; +} diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 5a4921851d32..f49cdb417005 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -44,3 +44,6 @@ 0400); \ module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); + +int damon_modules_new_paddr_ctx_target(struct damon_ctx **ctxp, + struct damon_target **targetp); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 162c9b1ca00f..e82631f39481 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -8,10 +8,8 @@ #define pr_fmt(fmt) "damon-reclaim: " fmt #include <linux/damon.h> -#include <linux/ioport.h> +#include <linux/kstrtox.h> #include <linux/module.h> -#include <linux/sched.h> -#include <linux/workqueue.h> #include "modules-common.h" @@ -183,38 +181,31 @@ static int damon_reclaim_turn(bool on) return 0; } -static struct delayed_work damon_reclaim_timer; -static void damon_reclaim_timer_fn(struct work_struct *work) -{ - static bool last_enabled; - bool now_enabled; - - now_enabled = enabled; - if (last_enabled != now_enabled) { - if (!damon_reclaim_turn(now_enabled)) - last_enabled = now_enabled; - else - enabled = last_enabled; - } -} -static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); - -static bool damon_reclaim_initialized; - static int damon_reclaim_enabled_store(const char *val, const struct 
kernel_param *kp) { - int rc = param_set_bool(val, kp); + bool is_enabled = enabled; + bool enable; + int err; - if (rc < 0) - return rc; + err = kstrtobool(val, &enable); + if (err) + return err; - /* system_wq might not initialized yet */ - if (!damon_reclaim_initialized) - return rc; + if (is_enabled == enable) + return 0; - schedule_delayed_work(&damon_reclaim_timer, 0); - return 0; + /* Called before init function. The function will handle this. */ + if (!ctx) + goto set_param_out; + + err = damon_reclaim_turn(enable); + if (err) + return err; + +set_param_out: + enabled = enable; + return err; } static const struct kernel_param_ops enabled_param_ops = { @@ -256,29 +247,19 @@ static int damon_reclaim_after_wmarks_check(struct damon_ctx *c) static int __init damon_reclaim_init(void) { - ctx = damon_new_ctx(); - if (!ctx) - return -ENOMEM; + int err = damon_modules_new_paddr_ctx_target(&ctx, &target); - if (damon_select_ops(ctx, DAMON_OPS_PADDR)) { - damon_destroy_ctx(ctx); - return -EINVAL; - } + if (err) + return err; ctx->callback.after_wmarks_check = damon_reclaim_after_wmarks_check; ctx->callback.after_aggregation = damon_reclaim_after_aggregation; - target = damon_new_target(); - if (!target) { - damon_destroy_ctx(ctx); - return -ENOMEM; - } - damon_add_target(ctx, target); - - schedule_delayed_work(&damon_reclaim_timer, 0); + /* 'enabled' has set before this function, probably via command line */ + if (enabled) + err = damon_reclaim_turn(true); - damon_reclaim_initialized = true; - return 0; + return err; } module_init(damon_reclaim_init); diff --git a/mm/damon/sysfs-common.c b/mm/damon/sysfs-common.c new file mode 100644 index 000000000000..52bebf242f74 --- /dev/null +++ b/mm/damon/sysfs-common.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park <sj@kernel.org> + */ + +#include <linux/slab.h> + +#include "sysfs-common.h" + +DEFINE_MUTEX(damon_sysfs_lock); + +/* + * 
unsigned long range directory + */ + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max) +{ + struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), + GFP_KERNEL); + + if (!range) + return NULL; + range->kobj = (struct kobject){}; + range->min = min; + range->max = max; + + return range; +} + +static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->min); +} + +static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long min; + int err; + + err = kstrtoul(buf, 0, &min); + if (err) + return err; + + range->min = min; + return count; +} + +static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + + return sysfs_emit(buf, "%lu\n", range->max); +} + +static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_ul_range *range = container_of(kobj, + struct damon_sysfs_ul_range, kobj); + unsigned long max; + int err; + + err = kstrtoul(buf, 0, &max); + if (err) + return err; + + range->max = max; + return count; +} + +void damon_sysfs_ul_range_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); +} + +static struct kobj_attribute damon_sysfs_ul_range_min_attr = + __ATTR_RW_MODE(min, 0600); + +static struct kobj_attribute damon_sysfs_ul_range_max_attr = + __ATTR_RW_MODE(max, 0600); + +static struct attribute *damon_sysfs_ul_range_attrs[] = { + &damon_sysfs_ul_range_min_attr.attr, + &damon_sysfs_ul_range_max_attr.attr, + NULL, +}; 
+ATTRIBUTE_GROUPS(damon_sysfs_ul_range); + +struct kobj_type damon_sysfs_ul_range_ktype = { + .release = damon_sysfs_ul_range_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_ul_range_groups, +}; + diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h new file mode 100644 index 000000000000..604a6cbc3ede --- /dev/null +++ b/mm/damon/sysfs-common.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Sysfs Interface + * + * Author: SeongJae Park <sj@kernel.org> + */ + +#include <linux/damon.h> +#include <linux/kobject.h> + +extern struct mutex damon_sysfs_lock; + +struct damon_sysfs_ul_range { + struct kobject kobj; + unsigned long min; + unsigned long max; +}; + +struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( + unsigned long min, + unsigned long max); +void damon_sysfs_ul_range_release(struct kobject *kobj); + +extern struct kobj_type damon_sysfs_ul_range_ktype; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes { + struct kobject kobj; + struct damon_sysfs_scheme **schemes_arr; + int nr; +}; + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void); +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes); + +extern struct kobj_type damon_sysfs_schemes_ktype; + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes); + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); + +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); + +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c new file mode 100644 index 000000000000..81fc4d27f4e4 --- /dev/null +++ b/mm/damon/sysfs-schemes.c @@ -0,0 +1,1338 @@ +// 
SPDX-License-Identifier: GPL-2.0 +/* + * DAMON sysfs Interface + * + * Copyright (c) 2022 SeongJae Park <sj@kernel.org> + */ + +#include <linux/slab.h> + +#include "sysfs-common.h" + +/* + * scheme region directory + */ + +struct damon_sysfs_scheme_region { + struct kobject kobj; + struct damon_addr_range ar; + unsigned int nr_accesses; + unsigned int age; + struct list_head list; +}; + +static struct damon_sysfs_scheme_region *damon_sysfs_scheme_region_alloc( + struct damon_region *region) +{ + struct damon_sysfs_scheme_region *sysfs_region = kmalloc( + sizeof(*sysfs_region), GFP_KERNEL); + + if (!sysfs_region) + return NULL; + sysfs_region->kobj = (struct kobject){}; + sysfs_region->ar = region->ar; + sysfs_region->nr_accesses = region->nr_accesses; + sysfs_region->age = region->age; + INIT_LIST_HEAD(&sysfs_region->list); + return sysfs_region; +} + +static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.start); +} + +static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%lu\n", region->ar.end); +} + +static ssize_t nr_accesses_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->nr_accesses); +} + +static ssize_t age_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + return sysfs_emit(buf, "%u\n", region->age); +} + +static void damon_sysfs_scheme_region_release(struct kobject *kobj) +{ + struct 
damon_sysfs_scheme_region *region = container_of(kobj, + struct damon_sysfs_scheme_region, kobj); + + list_del(®ion->list); + kfree(region); +} + +static struct kobj_attribute damon_sysfs_scheme_region_start_attr = + __ATTR_RO_MODE(start, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_end_attr = + __ATTR_RO_MODE(end, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_nr_accesses_attr = + __ATTR_RO_MODE(nr_accesses, 0400); + +static struct kobj_attribute damon_sysfs_scheme_region_age_attr = + __ATTR_RO_MODE(age, 0400); + +static struct attribute *damon_sysfs_scheme_region_attrs[] = { + &damon_sysfs_scheme_region_start_attr.attr, + &damon_sysfs_scheme_region_end_attr.attr, + &damon_sysfs_scheme_region_nr_accesses_attr.attr, + &damon_sysfs_scheme_region_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_region); + +static struct kobj_type damon_sysfs_scheme_region_ktype = { + .release = damon_sysfs_scheme_region_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_region_groups, +}; + +/* + * scheme regions directory + */ + +struct damon_sysfs_scheme_regions { + struct kobject kobj; + struct list_head regions_list; + int nr_regions; +}; + +static struct damon_sysfs_scheme_regions * +damon_sysfs_scheme_regions_alloc(void) +{ + struct damon_sysfs_scheme_regions *regions = kmalloc(sizeof(*regions), + GFP_KERNEL); + + regions->kobj = (struct kobject){}; + INIT_LIST_HEAD(®ions->regions_list); + regions->nr_regions = 0; + return regions; +} + +static void damon_sysfs_scheme_regions_rm_dirs( + struct damon_sysfs_scheme_regions *regions) +{ + struct damon_sysfs_scheme_region *r, *next; + + list_for_each_entry_safe(r, next, ®ions->regions_list, list) { + /* release function deletes it from the list */ + kobject_put(&r->kobj); + regions->nr_regions--; + } +} + +static void damon_sysfs_scheme_regions_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); +} + 
+static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); + +static struct kobj_type damon_sysfs_scheme_regions_ktype = { + .release = damon_sysfs_scheme_regions_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_regions_groups, +}; + +/* + * schemes/stats directory + */ + +struct damon_sysfs_stats { + struct kobject kobj; + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + +static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); +} + +static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_tried); +} + +static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_tried); +} + +static ssize_t nr_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->nr_applied); +} + +static ssize_t sz_applied_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->sz_applied); +} + +static ssize_t qt_exceeds_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_stats *stats = container_of(kobj, + struct damon_sysfs_stats, kobj); + + return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); +} + +static void damon_sysfs_stats_release(struct kobject 
*kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); +} + +static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = + __ATTR_RO_MODE(nr_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = + __ATTR_RO_MODE(sz_tried, 0400); + +static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = + __ATTR_RO_MODE(nr_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = + __ATTR_RO_MODE(sz_applied, 0400); + +static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = + __ATTR_RO_MODE(qt_exceeds, 0400); + +static struct attribute *damon_sysfs_stats_attrs[] = { + &damon_sysfs_stats_nr_tried_attr.attr, + &damon_sysfs_stats_sz_tried_attr.attr, + &damon_sysfs_stats_nr_applied_attr.attr, + &damon_sysfs_stats_sz_applied_attr.attr, + &damon_sysfs_stats_qt_exceeds_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_stats); + +static struct kobj_type damon_sysfs_stats_ktype = { + .release = damon_sysfs_stats_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_stats_groups, +}; + +/* + * watermarks directory + */ + +struct damon_sysfs_watermarks { + struct kobject kobj; + enum damos_wmark_metric metric; + unsigned long interval_us; + unsigned long high; + unsigned long mid; + unsigned long low; +}; + +static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( + enum damos_wmark_metric metric, unsigned long interval_us, + unsigned long high, unsigned long mid, unsigned long low) +{ + struct damon_sysfs_watermarks *watermarks = kmalloc( + sizeof(*watermarks), GFP_KERNEL); + + if (!watermarks) + return NULL; + watermarks->kobj = (struct kobject){}; + watermarks->metric = metric; + watermarks->interval_us = interval_us; + watermarks->high = high; + watermarks->mid = mid; + watermarks->low = low; + return watermarks; +} + +/* Should match with enum damos_wmark_metric */ +static const char * const damon_sysfs_wmark_metric_strs[] = { + "none", + "free_mem_rate", +}; + +static 
ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_wmark_metric_strs[watermarks->metric]); +} + +static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + enum damos_wmark_metric metric; + + for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { + if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { + watermarks->metric = metric; + return count; + } + } + return -EINVAL; +} + +static ssize_t interval_us_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->interval_us); +} + +static ssize_t interval_us_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->interval_us); + + return err ? err : count; +} + +static ssize_t high_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->high); +} + +static ssize_t high_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->high); + + return err ? 
err : count; +} + +static ssize_t mid_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->mid); +} + +static ssize_t mid_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->mid); + + return err ? err : count; +} + +static ssize_t low_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + + return sysfs_emit(buf, "%lu\n", watermarks->low); +} + +static ssize_t low_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_watermarks *watermarks = container_of(kobj, + struct damon_sysfs_watermarks, kobj); + int err = kstrtoul(buf, 0, &watermarks->low); + + return err ? 
err : count; +} + +static void damon_sysfs_watermarks_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); +} + +static struct kobj_attribute damon_sysfs_watermarks_metric_attr = + __ATTR_RW_MODE(metric, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = + __ATTR_RW_MODE(interval_us, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_high_attr = + __ATTR_RW_MODE(high, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_mid_attr = + __ATTR_RW_MODE(mid, 0600); + +static struct kobj_attribute damon_sysfs_watermarks_low_attr = + __ATTR_RW_MODE(low, 0600); + +static struct attribute *damon_sysfs_watermarks_attrs[] = { + &damon_sysfs_watermarks_metric_attr.attr, + &damon_sysfs_watermarks_interval_us_attr.attr, + &damon_sysfs_watermarks_high_attr.attr, + &damon_sysfs_watermarks_mid_attr.attr, + &damon_sysfs_watermarks_low_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_watermarks); + +static struct kobj_type damon_sysfs_watermarks_ktype = { + .release = damon_sysfs_watermarks_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_watermarks_groups, +}; + +/* + * scheme/weights directory + */ + +struct damon_sysfs_weights { + struct kobject kobj; + unsigned int sz; + unsigned int nr_accesses; + unsigned int age; +}; + +static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, + unsigned int nr_accesses, unsigned int age) +{ + struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), + GFP_KERNEL); + + if (!weights) + return NULL; + weights->kobj = (struct kobject){}; + weights->sz = sz; + weights->nr_accesses = nr_accesses; + weights->age = age; + return weights; +} + +static ssize_t sz_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->sz); +} + +static ssize_t 
sz_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->sz); + + return err ? err : count; +} + +static ssize_t nr_accesses_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->nr_accesses); +} + +static ssize_t nr_accesses_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->nr_accesses); + + return err ? err : count; +} + +static ssize_t age_permil_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + + return sysfs_emit(buf, "%u\n", weights->age); +} + +static ssize_t age_permil_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_weights *weights = container_of(kobj, + struct damon_sysfs_weights, kobj); + int err = kstrtouint(buf, 0, &weights->age); + + return err ? 
err : count; +} + +static void damon_sysfs_weights_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); +} + +static struct kobj_attribute damon_sysfs_weights_sz_attr = + __ATTR_RW_MODE(sz_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = + __ATTR_RW_MODE(nr_accesses_permil, 0600); + +static struct kobj_attribute damon_sysfs_weights_age_attr = + __ATTR_RW_MODE(age_permil, 0600); + +static struct attribute *damon_sysfs_weights_attrs[] = { + &damon_sysfs_weights_sz_attr.attr, + &damon_sysfs_weights_nr_accesses_attr.attr, + &damon_sysfs_weights_age_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_weights); + +static struct kobj_type damon_sysfs_weights_ktype = { + .release = damon_sysfs_weights_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_weights_groups, +}; + +/* + * quotas directory + */ + +struct damon_sysfs_quotas { + struct kobject kobj; + struct damon_sysfs_weights *weights; + unsigned long ms; + unsigned long sz; + unsigned long reset_interval_ms; +}; + +static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); +} + +static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) +{ + struct damon_sysfs_weights *weights; + int err; + + weights = damon_sysfs_weights_alloc(0, 0, 0); + if (!weights) + return -ENOMEM; + + err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, + "as->kobj, "weights"); + if (err) + kobject_put(&weights->kobj); + else + quotas->weights = weights; + return err; +} + +static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) +{ + kobject_put("as->weights->kobj); +} + +static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_quotas *quotas = container_of(kobj, + struct damon_sysfs_quotas, kobj); + + return sysfs_emit(buf, "%lu\n", quotas->ms); +} + +static 
/*
 * Parse @buf as an unsigned long and store it as the size quota.
 *
 * Returns @count on success, or -EINVAL for any parse failure.
 */
static ssize_t bytes_store(struct kobject *kobj,
		struct kobj_attribute *attr, const char *buf, size_t count)
{
	struct damon_sysfs_quotas *q = container_of(kobj,
			struct damon_sysfs_quotas, kobj);

	/* Every kstrtoul() failure is reported as -EINVAL here. */
	if (kstrtoul(buf, 0, &q->sz))
		return -EINVAL;

	return count;
}
&damon_sysfs_quotas_reset_interval_ms_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_quotas); + +static struct kobj_type damon_sysfs_quotas_ktype = { + .release = damon_sysfs_quotas_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_quotas_groups, +}; + +/* + * access_pattern directory + */ + +struct damon_sysfs_access_pattern { + struct kobject kobj; + struct damon_sysfs_ul_range *sz; + struct damon_sysfs_ul_range *nr_accesses; + struct damon_sysfs_ul_range *age; +}; + +static +struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) +{ + struct damon_sysfs_access_pattern *access_pattern = + kmalloc(sizeof(*access_pattern), GFP_KERNEL); + + if (!access_pattern) + return NULL; + access_pattern->kobj = (struct kobject){}; + return access_pattern; +} + +static int damon_sysfs_access_pattern_add_range_dir( + struct damon_sysfs_access_pattern *access_pattern, + struct damon_sysfs_ul_range **range_dir_ptr, + char *name) +{ + struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); + int err; + + if (!range) + return -ENOMEM; + err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, + &access_pattern->kobj, name); + if (err) + kobject_put(&range->kobj); + else + *range_dir_ptr = range; + return err; +} + +static int damon_sysfs_access_pattern_add_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + int err; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->sz, "sz"); + if (err) + goto put_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->nr_accesses, "nr_accesses"); + if (err) + goto put_nr_accesses_sz_out; + + err = damon_sysfs_access_pattern_add_range_dir(access_pattern, + &access_pattern->age, "age"); + if (err) + goto put_age_nr_accesses_sz_out; + return 0; + +put_age_nr_accesses_sz_out: + kobject_put(&access_pattern->age->kobj); + access_pattern->age = NULL; +put_nr_accesses_sz_out: + 
kobject_put(&access_pattern->nr_accesses->kobj); + access_pattern->nr_accesses = NULL; +put_sz_out: + kobject_put(&access_pattern->sz->kobj); + access_pattern->sz = NULL; + return err; +} + +static void damon_sysfs_access_pattern_rm_dirs( + struct damon_sysfs_access_pattern *access_pattern) +{ + kobject_put(&access_pattern->sz->kobj); + kobject_put(&access_pattern->nr_accesses->kobj); + kobject_put(&access_pattern->age->kobj); +} + +static void damon_sysfs_access_pattern_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); +} + +static struct attribute *damon_sysfs_access_pattern_attrs[] = { + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); + +static struct kobj_type damon_sysfs_access_pattern_ktype = { + .release = damon_sysfs_access_pattern_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_access_pattern_groups, +}; + +/* + * scheme directory + */ + +struct damon_sysfs_scheme { + struct kobject kobj; + enum damos_action action; + struct damon_sysfs_access_pattern *access_pattern; + struct damon_sysfs_quotas *quotas; + struct damon_sysfs_watermarks *watermarks; + struct damon_sysfs_stats *stats; + struct damon_sysfs_scheme_regions *tried_regions; +}; + +/* This should match with enum damos_action */ +static const char * const damon_sysfs_damos_action_strs[] = { + "willneed", + "cold", + "pageout", + "hugepage", + "nohugepage", + "lru_prio", + "lru_deprio", + "stat", +}; + +static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( + enum damos_action action) +{ + struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), + GFP_KERNEL); + + if (!scheme) + return NULL; + scheme->kobj = (struct kobject){}; + scheme->action = action; + return scheme; +} + +static int damon_sysfs_scheme_set_access_pattern( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_access_pattern *access_pattern; + int err; + + access_pattern = damon_sysfs_access_pattern_alloc(); + if 
(!access_pattern) + return -ENOMEM; + err = kobject_init_and_add(&access_pattern->kobj, + &damon_sysfs_access_pattern_ktype, &scheme->kobj, + "access_pattern"); + if (err) + goto out; + err = damon_sysfs_access_pattern_add_dirs(access_pattern); + if (err) + goto out; + scheme->access_pattern = access_pattern; + return 0; + +out: + kobject_put(&access_pattern->kobj); + return err; +} + +static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); + int err; + + if (!quotas) + return -ENOMEM; + err = kobject_init_and_add("as->kobj, &damon_sysfs_quotas_ktype, + &scheme->kobj, "quotas"); + if (err) + goto out; + err = damon_sysfs_quotas_add_dirs(quotas); + if (err) + goto out; + scheme->quotas = quotas; + return 0; + +out: + kobject_put("as->kobj); + return err; +} + +static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_watermarks *watermarks = + damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); + int err; + + if (!watermarks) + return -ENOMEM; + err = kobject_init_and_add(&watermarks->kobj, + &damon_sysfs_watermarks_ktype, &scheme->kobj, + "watermarks"); + if (err) + kobject_put(&watermarks->kobj); + else + scheme->watermarks = watermarks; + return err; +} + +static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); + int err; + + if (!stats) + return -ENOMEM; + err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, + &scheme->kobj, "stats"); + if (err) + kobject_put(&stats->kobj); + else + scheme->stats = stats; + return err; +} + +static int damon_sysfs_scheme_set_tried_regions( + struct damon_sysfs_scheme *scheme) +{ + struct damon_sysfs_scheme_regions *tried_regions = + damon_sysfs_scheme_regions_alloc(); + int err; + + if (!tried_regions) + return -ENOMEM; + err = kobject_init_and_add(&tried_regions->kobj, + 
&damon_sysfs_scheme_regions_ktype, &scheme->kobj, + "tried_regions"); + if (err) + kobject_put(&tried_regions->kobj); + else + scheme->tried_regions = tried_regions; + return err; +} + +static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) +{ + int err; + + err = damon_sysfs_scheme_set_access_pattern(scheme); + if (err) + return err; + err = damon_sysfs_scheme_set_quotas(scheme); + if (err) + goto put_access_pattern_out; + err = damon_sysfs_scheme_set_watermarks(scheme); + if (err) + goto put_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_stats(scheme); + if (err) + goto put_watermarks_quotas_access_pattern_out; + err = damon_sysfs_scheme_set_tried_regions(scheme); + if (err) + goto put_tried_regions_out; + return 0; + +put_tried_regions_out: + kobject_put(&scheme->tried_regions->kobj); + scheme->tried_regions = NULL; +put_watermarks_quotas_access_pattern_out: + kobject_put(&scheme->watermarks->kobj); + scheme->watermarks = NULL; +put_quotas_access_pattern_out: + kobject_put(&scheme->quotas->kobj); + scheme->quotas = NULL; +put_access_pattern_out: + kobject_put(&scheme->access_pattern->kobj); + scheme->access_pattern = NULL; + return err; +} + +static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) +{ + damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); + kobject_put(&scheme->access_pattern->kobj); + damon_sysfs_quotas_rm_dirs(scheme->quotas); + kobject_put(&scheme->quotas->kobj); + kobject_put(&scheme->watermarks->kobj); + kobject_put(&scheme->stats->kobj); + damon_sysfs_scheme_regions_rm_dirs(scheme->tried_regions); + kobject_put(&scheme->tried_regions->kobj); +} + +static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + + return sysfs_emit(buf, "%s\n", + damon_sysfs_damos_action_strs[scheme->action]); +} + +static ssize_t action_store(struct kobject *kobj, struct 
kobj_attribute *attr, + const char *buf, size_t count) +{ + struct damon_sysfs_scheme *scheme = container_of(kobj, + struct damon_sysfs_scheme, kobj); + enum damos_action action; + + for (action = 0; action < NR_DAMOS_ACTIONS; action++) { + if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { + scheme->action = action; + return count; + } + } + return -EINVAL; +} + +static void damon_sysfs_scheme_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); +} + +static struct kobj_attribute damon_sysfs_scheme_action_attr = + __ATTR_RW_MODE(action, 0600); + +static struct attribute *damon_sysfs_scheme_attrs[] = { + &damon_sysfs_scheme_action_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_scheme); + +static struct kobj_type damon_sysfs_scheme_ktype = { + .release = damon_sysfs_scheme_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_scheme_groups, +}; + +/* + * schemes directory + */ + +struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) +{ + return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); +} + +void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) +{ + struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; + int i; + + for (i = 0; i < schemes->nr; i++) { + damon_sysfs_scheme_rm_dirs(schemes_arr[i]); + kobject_put(&schemes_arr[i]->kobj); + } + schemes->nr = 0; + kfree(schemes_arr); + schemes->schemes_arr = NULL; +} + +static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, + int nr_schemes) +{ + struct damon_sysfs_scheme **schemes_arr, *scheme; + int err, i; + + damon_sysfs_schemes_rm_dirs(schemes); + if (!nr_schemes) + return 0; + + schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), + GFP_KERNEL | __GFP_NOWARN); + if (!schemes_arr) + return -ENOMEM; + schemes->schemes_arr = schemes_arr; + + for (i = 0; i < nr_schemes; i++) { + scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); + if (!scheme) { + 
damon_sysfs_schemes_rm_dirs(schemes); + return -ENOMEM; + } + + err = kobject_init_and_add(&scheme->kobj, + &damon_sysfs_scheme_ktype, &schemes->kobj, + "%d", i); + if (err) + goto out; + err = damon_sysfs_scheme_add_dirs(scheme); + if (err) + goto out; + + schemes_arr[i] = scheme; + schemes->nr++; + } + return 0; + +out: + damon_sysfs_schemes_rm_dirs(schemes); + kobject_put(&scheme->kobj); + return err; +} + +static ssize_t nr_schemes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_schemes *schemes = container_of(kobj, + struct damon_sysfs_schemes, kobj); + + return sysfs_emit(buf, "%d\n", schemes->nr); +} + +static ssize_t nr_schemes_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_schemes *schemes; + int nr, err = kstrtoint(buf, 0, &nr); + + if (err) + return err; + if (nr < 0) + return -EINVAL; + + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + + if (!mutex_trylock(&damon_sysfs_lock)) + return -EBUSY; + err = damon_sysfs_schemes_add_dirs(schemes, nr); + mutex_unlock(&damon_sysfs_lock); + if (err) + return err; + return count; +} + +static void damon_sysfs_schemes_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); +} + +static struct kobj_attribute damon_sysfs_schemes_nr_attr = + __ATTR_RW_MODE(nr_schemes, 0600); + +static struct attribute *damon_sysfs_schemes_attrs[] = { + &damon_sysfs_schemes_nr_attr.attr, + NULL, +}; +ATTRIBUTE_GROUPS(damon_sysfs_schemes); + +struct kobj_type damon_sysfs_schemes_ktype = { + .release = damon_sysfs_schemes_release, + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = damon_sysfs_schemes_groups, +}; + +static struct damos *damon_sysfs_mk_scheme( + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct 
damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; + struct damos_quota quota = { + .ms = sysfs_quotas->ms, + .sz = sysfs_quotas->sz, + .reset_interval = sysfs_quotas->reset_interval_ms, + .weight_sz = sysfs_weights->sz, + .weight_nr_accesses = sysfs_weights->nr_accesses, + .weight_age = sysfs_weights->age, + }; + struct damos_watermarks wmarks = { + .metric = sysfs_wmarks->metric, + .interval = sysfs_wmarks->interval_us, + .high = sysfs_wmarks->high, + .mid = sysfs_wmarks->mid, + .low = sysfs_wmarks->low, + }; + + return damon_new_scheme(&pattern, sysfs_scheme->action, &quota, + &wmarks); +} + +static void damon_sysfs_update_scheme(struct damos *scheme, + struct damon_sysfs_scheme *sysfs_scheme) +{ + struct damon_sysfs_access_pattern *access_pattern = + sysfs_scheme->access_pattern; + struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; + struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; + struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + scheme->pattern.min_sz_region = access_pattern->sz->min; + scheme->pattern.max_sz_region = access_pattern->sz->max; + scheme->pattern.min_nr_accesses = access_pattern->nr_accesses->min; + scheme->pattern.max_nr_accesses = access_pattern->nr_accesses->max; + scheme->pattern.min_age_region = access_pattern->age->min; + scheme->pattern.max_age_region = access_pattern->age->max; + + scheme->action = sysfs_scheme->action; + + scheme->quota.ms = sysfs_quotas->ms; + scheme->quota.sz = sysfs_quotas->sz; + scheme->quota.reset_interval = sysfs_quotas->reset_interval_ms; +
scheme->quota.weight_sz = sysfs_weights->sz; + scheme->quota.weight_nr_accesses = sysfs_weights->nr_accesses; + scheme->quota.weight_age = sysfs_weights->age; + + scheme->wmarks.metric = sysfs_wmarks->metric; + scheme->wmarks.interval = sysfs_wmarks->interval_us; + scheme->wmarks.high = sysfs_wmarks->high; + scheme->wmarks.mid = sysfs_wmarks->mid; + scheme->wmarks.low = sysfs_wmarks->low; +} + +int damon_sysfs_set_schemes(struct damon_ctx *ctx, + struct damon_sysfs_schemes *sysfs_schemes) +{ + struct damos *scheme, *next; + int i = 0; + + damon_for_each_scheme_safe(scheme, next, ctx) { + if (i < sysfs_schemes->nr) + damon_sysfs_update_scheme(scheme, + sysfs_schemes->schemes_arr[i]); + else + damon_destroy_scheme(scheme); + i++; + } + + for (; i < sysfs_schemes->nr; i++) { + struct damos *scheme, *next; + + scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); + if (!scheme) { + damon_for_each_scheme_safe(scheme, next, ctx) + damon_destroy_scheme(scheme); + return -ENOMEM; + } + damon_add_scheme(ctx, scheme); + } + return 0; +} + +void damon_sysfs_schemes_update_stats( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_stats *sysfs_stats; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; + sysfs_stats->nr_tried = scheme->stat.nr_tried; + sysfs_stats->sz_tried = scheme->stat.sz_tried; + sysfs_stats->nr_applied = scheme->stat.nr_applied; + sysfs_stats->sz_applied = scheme->stat.sz_applied; + sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; + } +} + +/* + * damon_sysfs_schemes that need to update its schemes regions dir. 
Protected + * by damon_sysfs_lock + */ +static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; +static int damon_sysfs_schemes_region_idx; + +/* + * DAMON callback that called before damos apply. While this callback is + * registered, damon_sysfs_lock should be held to ensure the regions + * directories exist. + */ +static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos *scheme; + struct damon_sysfs_scheme_regions *sysfs_regions; + struct damon_sysfs_scheme_region *region; + struct damon_sysfs_schemes *sysfs_schemes = + damon_sysfs_schemes_for_damos_callback; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + if (scheme == s) + break; + schemes_idx++; + } + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + return 0; + + sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + region = damon_sysfs_scheme_region_alloc(r); + list_add_tail(&region->list, &sysfs_regions->regions_list); + sysfs_regions->nr_regions++; + if (kobject_init_and_add(&region->kobj, + &damon_sysfs_scheme_region_ktype, + &sysfs_regions->kobj, "%d", + damon_sysfs_schemes_region_idx++)) { + kobject_put(&region->kobj); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int damon_sysfs_schemes_clear_regions( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + struct damos *scheme; + int schemes_idx = 0; + + damon_for_each_scheme(scheme, ctx) { + struct damon_sysfs_scheme *sysfs_scheme; + + /* user could have removed the scheme sysfs dir */ + if (schemes_idx >= sysfs_schemes->nr) + break; + + sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; + damon_sysfs_scheme_regions_rm_dirs( + sysfs_scheme->tried_regions); + } + return 0; +} + +/* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ +int
damon_sysfs_schemes_update_regions_start( + struct damon_sysfs_schemes *sysfs_schemes, + struct damon_ctx *ctx) +{ + damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); + damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; + return 0; +} + +/* + * Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock. Caller + * should unlock damon_sysfs_lock which held before + * damon_sysfs_schemes_update_regions_start() + */ +int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx) +{ + damon_sysfs_schemes_for_damos_callback = NULL; + ctx->callback.before_damos_apply = NULL; + damon_sysfs_schemes_region_idx = 0; + return 0; +} diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 9f1219a67e3f..aeb0beb1da91 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -5,1056 +5,11 @@ * Copyright (c) 2022 SeongJae Park <sj@kernel.org> */ -#include <linux/damon.h> -#include <linux/kobject.h> #include <linux/pid.h> #include <linux/sched.h> #include <linux/slab.h> -static DEFINE_MUTEX(damon_sysfs_lock); - -/* - * unsigned long range directory - */ - -struct damon_sysfs_ul_range { - struct kobject kobj; - unsigned long min; - unsigned long max; -}; - -static struct damon_sysfs_ul_range *damon_sysfs_ul_range_alloc( - unsigned long min, - unsigned long max) -{ - struct damon_sysfs_ul_range *range = kmalloc(sizeof(*range), - GFP_KERNEL); - - if (!range) - return NULL; - range->kobj = (struct kobject){}; - range->min = min; - range->max = max; - - return range; -} - -static ssize_t min_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->min); -} - -static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct 
damon_sysfs_ul_range, kobj); - unsigned long min; - int err; - - err = kstrtoul(buf, 0, &min); - if (err) - return err; - - range->min = min; - return count; -} - -static ssize_t max_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - - return sysfs_emit(buf, "%lu\n", range->max); -} - -static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_ul_range *range = container_of(kobj, - struct damon_sysfs_ul_range, kobj); - unsigned long max; - int err; - - err = kstrtoul(buf, 0, &max); - if (err) - return err; - - range->max = max; - return count; -} - -static void damon_sysfs_ul_range_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_ul_range, kobj)); -} - -static struct kobj_attribute damon_sysfs_ul_range_min_attr = - __ATTR_RW_MODE(min, 0600); - -static struct kobj_attribute damon_sysfs_ul_range_max_attr = - __ATTR_RW_MODE(max, 0600); - -static struct attribute *damon_sysfs_ul_range_attrs[] = { - &damon_sysfs_ul_range_min_attr.attr, - &damon_sysfs_ul_range_max_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_ul_range); - -static struct kobj_type damon_sysfs_ul_range_ktype = { - .release = damon_sysfs_ul_range_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_ul_range_groups, -}; - -/* - * schemes/stats directory - */ - -struct damon_sysfs_stats { - struct kobject kobj; - unsigned long nr_tried; - unsigned long sz_tried; - unsigned long nr_applied; - unsigned long sz_applied; - unsigned long qt_exceeds; -}; - -static struct damon_sysfs_stats *damon_sysfs_stats_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_stats), GFP_KERNEL); -} - -static ssize_t nr_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, 
kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_tried); -} - -static ssize_t sz_tried_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_tried); -} - -static ssize_t nr_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->nr_applied); -} - -static ssize_t sz_applied_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->sz_applied); -} - -static ssize_t qt_exceeds_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_stats *stats = container_of(kobj, - struct damon_sysfs_stats, kobj); - - return sysfs_emit(buf, "%lu\n", stats->qt_exceeds); -} - -static void damon_sysfs_stats_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_stats, kobj)); -} - -static struct kobj_attribute damon_sysfs_stats_nr_tried_attr = - __ATTR_RO_MODE(nr_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_tried_attr = - __ATTR_RO_MODE(sz_tried, 0400); - -static struct kobj_attribute damon_sysfs_stats_nr_applied_attr = - __ATTR_RO_MODE(nr_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_sz_applied_attr = - __ATTR_RO_MODE(sz_applied, 0400); - -static struct kobj_attribute damon_sysfs_stats_qt_exceeds_attr = - __ATTR_RO_MODE(qt_exceeds, 0400); - -static struct attribute *damon_sysfs_stats_attrs[] = { - &damon_sysfs_stats_nr_tried_attr.attr, - &damon_sysfs_stats_sz_tried_attr.attr, - &damon_sysfs_stats_nr_applied_attr.attr, - &damon_sysfs_stats_sz_applied_attr.attr, - &damon_sysfs_stats_qt_exceeds_attr.attr, - NULL, -}; 
-ATTRIBUTE_GROUPS(damon_sysfs_stats); - -static struct kobj_type damon_sysfs_stats_ktype = { - .release = damon_sysfs_stats_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_stats_groups, -}; - -/* - * watermarks directory - */ - -struct damon_sysfs_watermarks { - struct kobject kobj; - enum damos_wmark_metric metric; - unsigned long interval_us; - unsigned long high; - unsigned long mid; - unsigned long low; -}; - -static struct damon_sysfs_watermarks *damon_sysfs_watermarks_alloc( - enum damos_wmark_metric metric, unsigned long interval_us, - unsigned long high, unsigned long mid, unsigned long low) -{ - struct damon_sysfs_watermarks *watermarks = kmalloc( - sizeof(*watermarks), GFP_KERNEL); - - if (!watermarks) - return NULL; - watermarks->kobj = (struct kobject){}; - watermarks->metric = metric; - watermarks->interval_us = interval_us; - watermarks->high = high; - watermarks->mid = mid; - watermarks->low = low; - return watermarks; -} - -/* Should match with enum damos_wmark_metric */ -static const char * const damon_sysfs_wmark_metric_strs[] = { - "none", - "free_mem_rate", -}; - -static ssize_t metric_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_wmark_metric_strs[watermarks->metric]); -} - -static ssize_t metric_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - enum damos_wmark_metric metric; - - for (metric = 0; metric < NR_DAMOS_WMARK_METRICS; metric++) { - if (sysfs_streq(buf, damon_sysfs_wmark_metric_strs[metric])) { - watermarks->metric = metric; - return count; - } - } - return -EINVAL; -} - -static ssize_t interval_us_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct 
damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->interval_us); -} - -static ssize_t interval_us_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->interval_us); - - return err ? err : count; -} - -static ssize_t high_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->high); -} - -static ssize_t high_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->high); - - return err ? err : count; -} - -static ssize_t mid_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->mid); -} - -static ssize_t mid_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->mid); - - return err ? 
err : count; -} - -static ssize_t low_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - - return sysfs_emit(buf, "%lu\n", watermarks->low); -} - -static ssize_t low_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_watermarks *watermarks = container_of(kobj, - struct damon_sysfs_watermarks, kobj); - int err = kstrtoul(buf, 0, &watermarks->low); - - return err ? err : count; -} - -static void damon_sysfs_watermarks_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_watermarks, kobj)); -} - -static struct kobj_attribute damon_sysfs_watermarks_metric_attr = - __ATTR_RW_MODE(metric, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_interval_us_attr = - __ATTR_RW_MODE(interval_us, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_high_attr = - __ATTR_RW_MODE(high, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_mid_attr = - __ATTR_RW_MODE(mid, 0600); - -static struct kobj_attribute damon_sysfs_watermarks_low_attr = - __ATTR_RW_MODE(low, 0600); - -static struct attribute *damon_sysfs_watermarks_attrs[] = { - &damon_sysfs_watermarks_metric_attr.attr, - &damon_sysfs_watermarks_interval_us_attr.attr, - &damon_sysfs_watermarks_high_attr.attr, - &damon_sysfs_watermarks_mid_attr.attr, - &damon_sysfs_watermarks_low_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_watermarks); - -static struct kobj_type damon_sysfs_watermarks_ktype = { - .release = damon_sysfs_watermarks_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_watermarks_groups, -}; - -/* - * scheme/weights directory - */ - -struct damon_sysfs_weights { - struct kobject kobj; - unsigned int sz; - unsigned int nr_accesses; - unsigned int age; -}; - -static struct damon_sysfs_weights *damon_sysfs_weights_alloc(unsigned int sz, - unsigned 
int nr_accesses, unsigned int age) -{ - struct damon_sysfs_weights *weights = kmalloc(sizeof(*weights), - GFP_KERNEL); - - if (!weights) - return NULL; - weights->kobj = (struct kobject){}; - weights->sz = sz; - weights->nr_accesses = nr_accesses; - weights->age = age; - return weights; -} - -static ssize_t sz_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->sz); -} - -static ssize_t sz_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->sz); - - return err ? err : count; -} - -static ssize_t nr_accesses_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->nr_accesses); -} - -static ssize_t nr_accesses_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->nr_accesses); - - return err ? err : count; -} - -static ssize_t age_permil_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - - return sysfs_emit(buf, "%u\n", weights->age); -} - -static ssize_t age_permil_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_weights *weights = container_of(kobj, - struct damon_sysfs_weights, kobj); - int err = kstrtouint(buf, 0, &weights->age); - - return err ? 
err : count; -} - -static void damon_sysfs_weights_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_weights, kobj)); -} - -static struct kobj_attribute damon_sysfs_weights_sz_attr = - __ATTR_RW_MODE(sz_permil, 0600); - -static struct kobj_attribute damon_sysfs_weights_nr_accesses_attr = - __ATTR_RW_MODE(nr_accesses_permil, 0600); - -static struct kobj_attribute damon_sysfs_weights_age_attr = - __ATTR_RW_MODE(age_permil, 0600); - -static struct attribute *damon_sysfs_weights_attrs[] = { - &damon_sysfs_weights_sz_attr.attr, - &damon_sysfs_weights_nr_accesses_attr.attr, - &damon_sysfs_weights_age_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_weights); - -static struct kobj_type damon_sysfs_weights_ktype = { - .release = damon_sysfs_weights_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_weights_groups, -}; - -/* - * quotas directory - */ - -struct damon_sysfs_quotas { - struct kobject kobj; - struct damon_sysfs_weights *weights; - unsigned long ms; - unsigned long sz; - unsigned long reset_interval_ms; -}; - -static struct damon_sysfs_quotas *damon_sysfs_quotas_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_quotas), GFP_KERNEL); -} - -static int damon_sysfs_quotas_add_dirs(struct damon_sysfs_quotas *quotas) -{ - struct damon_sysfs_weights *weights; - int err; - - weights = damon_sysfs_weights_alloc(0, 0, 0); - if (!weights) - return -ENOMEM; - - err = kobject_init_and_add(&weights->kobj, &damon_sysfs_weights_ktype, - &quotas->kobj, "weights"); - if (err) - kobject_put(&weights->kobj); - else - quotas->weights = weights; - return err; -} - -static void damon_sysfs_quotas_rm_dirs(struct damon_sysfs_quotas *quotas) -{ - kobject_put(&quotas->weights->kobj); -} - -static ssize_t ms_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->ms); -} - -static
ssize_t ms_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, &quotas->ms); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t bytes_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->sz); -} - -static ssize_t bytes_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, &quotas->sz); - - if (err) - return -EINVAL; - return count; -} - -static ssize_t reset_interval_ms_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - - return sysfs_emit(buf, "%lu\n", quotas->reset_interval_ms); -} - -static ssize_t reset_interval_ms_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_quotas *quotas = container_of(kobj, - struct damon_sysfs_quotas, kobj); - int err = kstrtoul(buf, 0, &quotas->reset_interval_ms); - - if (err) - return -EINVAL; - return count; -} - -static void damon_sysfs_quotas_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_quotas, kobj)); -} - -static struct kobj_attribute damon_sysfs_quotas_ms_attr = - __ATTR_RW_MODE(ms, 0600); - -static struct kobj_attribute damon_sysfs_quotas_sz_attr = - __ATTR_RW_MODE(bytes, 0600); - -static struct kobj_attribute damon_sysfs_quotas_reset_interval_ms_attr = - __ATTR_RW_MODE(reset_interval_ms, 0600); - -static struct attribute *damon_sysfs_quotas_attrs[] = { - &damon_sysfs_quotas_ms_attr.attr, - &damon_sysfs_quotas_sz_attr.attr, -
&damon_sysfs_quotas_reset_interval_ms_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_quotas); - -static struct kobj_type damon_sysfs_quotas_ktype = { - .release = damon_sysfs_quotas_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_quotas_groups, -}; - -/* - * access_pattern directory - */ - -struct damon_sysfs_access_pattern { - struct kobject kobj; - struct damon_sysfs_ul_range *sz; - struct damon_sysfs_ul_range *nr_accesses; - struct damon_sysfs_ul_range *age; -}; - -static -struct damon_sysfs_access_pattern *damon_sysfs_access_pattern_alloc(void) -{ - struct damon_sysfs_access_pattern *access_pattern = - kmalloc(sizeof(*access_pattern), GFP_KERNEL); - - if (!access_pattern) - return NULL; - access_pattern->kobj = (struct kobject){}; - return access_pattern; -} - -static int damon_sysfs_access_pattern_add_range_dir( - struct damon_sysfs_access_pattern *access_pattern, - struct damon_sysfs_ul_range **range_dir_ptr, - char *name) -{ - struct damon_sysfs_ul_range *range = damon_sysfs_ul_range_alloc(0, 0); - int err; - - if (!range) - return -ENOMEM; - err = kobject_init_and_add(&range->kobj, &damon_sysfs_ul_range_ktype, - &access_pattern->kobj, name); - if (err) - kobject_put(&range->kobj); - else - *range_dir_ptr = range; - return err; -} - -static int damon_sysfs_access_pattern_add_dirs( - struct damon_sysfs_access_pattern *access_pattern) -{ - int err; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->sz, "sz"); - if (err) - goto put_sz_out; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->nr_accesses, "nr_accesses"); - if (err) - goto put_nr_accesses_sz_out; - - err = damon_sysfs_access_pattern_add_range_dir(access_pattern, - &access_pattern->age, "age"); - if (err) - goto put_age_nr_accesses_sz_out; - return 0; - -put_age_nr_accesses_sz_out: - kobject_put(&access_pattern->age->kobj); - access_pattern->age = NULL; -put_nr_accesses_sz_out: - 
kobject_put(&access_pattern->nr_accesses->kobj); - access_pattern->nr_accesses = NULL; -put_sz_out: - kobject_put(&access_pattern->sz->kobj); - access_pattern->sz = NULL; - return err; -} - -static void damon_sysfs_access_pattern_rm_dirs( - struct damon_sysfs_access_pattern *access_pattern) -{ - kobject_put(&access_pattern->sz->kobj); - kobject_put(&access_pattern->nr_accesses->kobj); - kobject_put(&access_pattern->age->kobj); -} - -static void damon_sysfs_access_pattern_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_access_pattern, kobj)); -} - -static struct attribute *damon_sysfs_access_pattern_attrs[] = { - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_access_pattern); - -static struct kobj_type damon_sysfs_access_pattern_ktype = { - .release = damon_sysfs_access_pattern_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_access_pattern_groups, -}; - -/* - * scheme directory - */ - -struct damon_sysfs_scheme { - struct kobject kobj; - enum damos_action action; - struct damon_sysfs_access_pattern *access_pattern; - struct damon_sysfs_quotas *quotas; - struct damon_sysfs_watermarks *watermarks; - struct damon_sysfs_stats *stats; -}; - -/* This should match with enum damos_action */ -static const char * const damon_sysfs_damos_action_strs[] = { - "willneed", - "cold", - "pageout", - "hugepage", - "nohugepage", - "lru_prio", - "lru_deprio", - "stat", -}; - -static struct damon_sysfs_scheme *damon_sysfs_scheme_alloc( - enum damos_action action) -{ - struct damon_sysfs_scheme *scheme = kmalloc(sizeof(*scheme), - GFP_KERNEL); - - if (!scheme) - return NULL; - scheme->kobj = (struct kobject){}; - scheme->action = action; - return scheme; -} - -static int damon_sysfs_scheme_set_access_pattern( - struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_access_pattern *access_pattern; - int err; - - access_pattern = damon_sysfs_access_pattern_alloc(); - if (!access_pattern) - return -ENOMEM; - err = 
kobject_init_and_add(&access_pattern->kobj, - &damon_sysfs_access_pattern_ktype, &scheme->kobj, - "access_pattern"); - if (err) - goto out; - err = damon_sysfs_access_pattern_add_dirs(access_pattern); - if (err) - goto out; - scheme->access_pattern = access_pattern; - return 0; - -out: - kobject_put(&access_pattern->kobj); - return err; -} - -static int damon_sysfs_scheme_set_quotas(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_quotas *quotas = damon_sysfs_quotas_alloc(); - int err; - - if (!quotas) - return -ENOMEM; - err = kobject_init_and_add(&quotas->kobj, &damon_sysfs_quotas_ktype, - &scheme->kobj, "quotas"); - if (err) - goto out; - err = damon_sysfs_quotas_add_dirs(quotas); - if (err) - goto out; - scheme->quotas = quotas; - return 0; - -out: - kobject_put(&quotas->kobj); - return err; -} - -static int damon_sysfs_scheme_set_watermarks(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_watermarks *watermarks = - damon_sysfs_watermarks_alloc(DAMOS_WMARK_NONE, 0, 0, 0, 0); - int err; - - if (!watermarks) - return -ENOMEM; - err = kobject_init_and_add(&watermarks->kobj, - &damon_sysfs_watermarks_ktype, &scheme->kobj, - "watermarks"); - if (err) - kobject_put(&watermarks->kobj); - else - scheme->watermarks = watermarks; - return err; -} - -static int damon_sysfs_scheme_set_stats(struct damon_sysfs_scheme *scheme) -{ - struct damon_sysfs_stats *stats = damon_sysfs_stats_alloc(); - int err; - - if (!stats) - return -ENOMEM; - err = kobject_init_and_add(&stats->kobj, &damon_sysfs_stats_ktype, - &scheme->kobj, "stats"); - if (err) - kobject_put(&stats->kobj); - else - scheme->stats = stats; - return err; -} - -static int damon_sysfs_scheme_add_dirs(struct damon_sysfs_scheme *scheme) -{ - int err; - - err = damon_sysfs_scheme_set_access_pattern(scheme); - if (err) - return err; - err = damon_sysfs_scheme_set_quotas(scheme); - if (err) - goto put_access_pattern_out; - err = damon_sysfs_scheme_set_watermarks(scheme); - if (err) - goto
put_quotas_access_pattern_out; - err = damon_sysfs_scheme_set_stats(scheme); - if (err) - goto put_watermarks_quotas_access_pattern_out; - return 0; - -put_watermarks_quotas_access_pattern_out: - kobject_put(&scheme->watermarks->kobj); - scheme->watermarks = NULL; -put_quotas_access_pattern_out: - kobject_put(&scheme->quotas->kobj); - scheme->quotas = NULL; -put_access_pattern_out: - kobject_put(&scheme->access_pattern->kobj); - scheme->access_pattern = NULL; - return err; -} - -static void damon_sysfs_scheme_rm_dirs(struct damon_sysfs_scheme *scheme) -{ - damon_sysfs_access_pattern_rm_dirs(scheme->access_pattern); - kobject_put(&scheme->access_pattern->kobj); - damon_sysfs_quotas_rm_dirs(scheme->quotas); - kobject_put(&scheme->quotas->kobj); - kobject_put(&scheme->watermarks->kobj); - kobject_put(&scheme->stats->kobj); -} - -static ssize_t action_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - - return sysfs_emit(buf, "%s\n", - damon_sysfs_damos_action_strs[scheme->action]); -} - -static ssize_t action_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - struct damon_sysfs_scheme *scheme = container_of(kobj, - struct damon_sysfs_scheme, kobj); - enum damos_action action; - - for (action = 0; action < NR_DAMOS_ACTIONS; action++) { - if (sysfs_streq(buf, damon_sysfs_damos_action_strs[action])) { - scheme->action = action; - return count; - } - } - return -EINVAL; -} - -static void damon_sysfs_scheme_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_scheme, kobj)); -} - -static struct kobj_attribute damon_sysfs_scheme_action_attr = - __ATTR_RW_MODE(action, 0600); - -static struct attribute *damon_sysfs_scheme_attrs[] = { - &damon_sysfs_scheme_action_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_scheme); - -static struct kobj_type damon_sysfs_scheme_ktype = { - .release = 
damon_sysfs_scheme_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_scheme_groups, -}; - -/* - * schemes directory - */ - -struct damon_sysfs_schemes { - struct kobject kobj; - struct damon_sysfs_scheme **schemes_arr; - int nr; -}; - -static struct damon_sysfs_schemes *damon_sysfs_schemes_alloc(void) -{ - return kzalloc(sizeof(struct damon_sysfs_schemes), GFP_KERNEL); -} - -static void damon_sysfs_schemes_rm_dirs(struct damon_sysfs_schemes *schemes) -{ - struct damon_sysfs_scheme **schemes_arr = schemes->schemes_arr; - int i; - - for (i = 0; i < schemes->nr; i++) { - damon_sysfs_scheme_rm_dirs(schemes_arr[i]); - kobject_put(&schemes_arr[i]->kobj); - } - schemes->nr = 0; - kfree(schemes_arr); - schemes->schemes_arr = NULL; -} - -static int damon_sysfs_schemes_add_dirs(struct damon_sysfs_schemes *schemes, - int nr_schemes) -{ - struct damon_sysfs_scheme **schemes_arr, *scheme; - int err, i; - - damon_sysfs_schemes_rm_dirs(schemes); - if (!nr_schemes) - return 0; - - schemes_arr = kmalloc_array(nr_schemes, sizeof(*schemes_arr), - GFP_KERNEL | __GFP_NOWARN); - if (!schemes_arr) - return -ENOMEM; - schemes->schemes_arr = schemes_arr; - - for (i = 0; i < nr_schemes; i++) { - scheme = damon_sysfs_scheme_alloc(DAMOS_STAT); - if (!scheme) { - damon_sysfs_schemes_rm_dirs(schemes); - return -ENOMEM; - } - - err = kobject_init_and_add(&scheme->kobj, - &damon_sysfs_scheme_ktype, &schemes->kobj, - "%d", i); - if (err) - goto out; - err = damon_sysfs_scheme_add_dirs(scheme); - if (err) - goto out; - - schemes_arr[i] = scheme; - schemes->nr++; - } - return 0; - -out: - damon_sysfs_schemes_rm_dirs(schemes); - kobject_put(&scheme->kobj); - return err; -} - -static ssize_t nr_schemes_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); - - return sysfs_emit(buf, "%d\n", schemes->nr); -} - -static ssize_t nr_schemes_store(struct kobject *kobj, - 
struct kobj_attribute *attr, const char *buf, size_t count) -{ - struct damon_sysfs_schemes *schemes; - int nr, err = kstrtoint(buf, 0, &nr); - - if (err) - return err; - if (nr < 0) - return -EINVAL; - - schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); - - if (!mutex_trylock(&damon_sysfs_lock)) - return -EBUSY; - err = damon_sysfs_schemes_add_dirs(schemes, nr); - mutex_unlock(&damon_sysfs_lock); - if (err) - return err; - return count; -} - -static void damon_sysfs_schemes_release(struct kobject *kobj) -{ - kfree(container_of(kobj, struct damon_sysfs_schemes, kobj)); -} - -static struct kobj_attribute damon_sysfs_schemes_nr_attr = - __ATTR_RW_MODE(nr_schemes, 0600); - -static struct attribute *damon_sysfs_schemes_attrs[] = { - &damon_sysfs_schemes_nr_attr.attr, - NULL, -}; -ATTRIBUTE_GROUPS(damon_sysfs_schemes); - -static struct kobj_type damon_sysfs_schemes_ktype = { - .release = damon_sysfs_schemes_release, - .sysfs_ops = &kobj_sysfs_ops, - .default_groups = damon_sysfs_schemes_groups, -}; +#include "sysfs-common.h" /* * init region directory @@ -1062,23 +17,12 @@ static struct kobj_type damon_sysfs_schemes_ktype = { struct damon_sysfs_region { struct kobject kobj; - unsigned long start; - unsigned long end; + struct damon_addr_range ar; }; -static struct damon_sysfs_region *damon_sysfs_region_alloc( - unsigned long start, - unsigned long end) +static struct damon_sysfs_region *damon_sysfs_region_alloc(void) { - struct damon_sysfs_region *region = kmalloc(sizeof(*region), - GFP_KERNEL); - - if (!region) - return NULL; - region->kobj = (struct kobject){}; - region->start = start; - region->end = end; - return region; + return kzalloc(sizeof(struct damon_sysfs_region), GFP_KERNEL); } static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1087,7 +31,7 @@ static ssize_t start_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - 
return sysfs_emit(buf, "%lu\n", region->start); + return sysfs_emit(buf, "%lu\n", region->ar.start); } static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1095,7 +39,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, &region->start); + int err = kstrtoul(buf, 0, &region->ar.start); return err ? err : count; } @@ -1106,7 +50,7 @@ static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - return sysfs_emit(buf, "%lu\n", region->end); + return sysfs_emit(buf, "%lu\n", region->ar.end); } static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -1114,7 +58,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, { struct damon_sysfs_region *region = container_of(kobj, struct damon_sysfs_region, kobj); - int err = kstrtoul(buf, 0, &region->end); + int err = kstrtoul(buf, 0, &region->ar.end); return err ? err : count; } @@ -1187,7 +131,7 @@ static int damon_sysfs_regions_add_dirs(struct damon_sysfs_regions *regions, regions->regions_arr = regions_arr; for (i = 0; i < nr_regions; i++) { - region = damon_sysfs_region_alloc(0, 0); + region = damon_sysfs_region_alloc(); if (!region) { damon_sysfs_regions_rm_dirs(regions); return -ENOMEM; @@ -2056,6 +1000,16 @@ enum damon_sysfs_cmd { */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried + * regions + */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS, + /* + * @DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: Clear schemes tried + * regions + */ + DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS, + /* * @NR_DAMON_SYSFS_CMDS: Total number of DAMON sysfs commands.
*/ NR_DAMON_SYSFS_CMDS, @@ -2067,6 +1021,8 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_regions", + "clear_schemes_tried_regions", }; /* @@ -2147,11 +1103,11 @@ static int damon_sysfs_set_regions(struct damon_target *t, struct damon_sysfs_region *sys_region = sysfs_regions->regions_arr[i]; - if (sys_region->start > sys_region->end) + if (sys_region->ar.start > sys_region->ar.end) goto out; - ranges[i].start = sys_region->start; - ranges[i].end = sys_region->end; + ranges[i].start = sys_region->ar.start; + ranges[i].end = sys_region->ar.end; if (i == 0) continue; if (ranges[i - 1].end > ranges[i].start) @@ -2246,65 +1202,19 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, return 0; } -static struct damos *damon_sysfs_mk_scheme( - struct damon_sysfs_scheme *sysfs_scheme) -{ - struct damon_sysfs_access_pattern *access_pattern = - sysfs_scheme->access_pattern; - struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; - struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; - struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; - - struct damos_access_pattern pattern = { - .min_sz_region = access_pattern->sz->min, - .max_sz_region = access_pattern->sz->max, - .min_nr_accesses = access_pattern->nr_accesses->min, - .max_nr_accesses = access_pattern->nr_accesses->max, - .min_age_region = access_pattern->age->min, - .max_age_region = access_pattern->age->max, - }; - struct damos_quota quota = { - .ms = sysfs_quotas->ms, - .sz = sysfs_quotas->sz, - .reset_interval = sysfs_quotas->reset_interval_ms, - .weight_sz = sysfs_weights->sz, - .weight_nr_accesses = sysfs_weights->nr_accesses, - .weight_age = sysfs_weights->age, - }; - struct damos_watermarks wmarks = { - .metric = sysfs_wmarks->metric, - .interval = sysfs_wmarks->interval_us, - .high = sysfs_wmarks->high, - .mid = sysfs_wmarks->mid, - .low = sysfs_wmarks->low, - }; - - return 
damon_new_scheme(&pattern, sysfs_scheme->action, &quota, - &wmarks); -} - -static int damon_sysfs_set_schemes(struct damon_ctx *ctx, - struct damon_sysfs_schemes *sysfs_schemes) -{ - int i; - - for (i = 0; i < sysfs_schemes->nr; i++) { - struct damos *scheme, *next; - - scheme = damon_sysfs_mk_scheme(sysfs_schemes->schemes_arr[i]); - if (!scheme) { - damon_for_each_scheme_safe(scheme, next, ctx) - damon_destroy_scheme(scheme); - return -ENOMEM; - } - damon_add_scheme(ctx, scheme); - } - return 0; -} - static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; + struct damon_sysfs_kdamond *kdamond; + + /* damon_sysfs_schemes_update_regions_stop() might not yet called */ + kdamond = damon_sysfs_cmd_request.kdamond; + if (kdamond && damon_sysfs_cmd_request.cmd == + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && + ctx == kdamond->damon_ctx) { + damon_sysfs_schemes_update_regions_stop(ctx); + mutex_unlock(&damon_sysfs_lock); + } if (!damon_target_has_pid(ctx)) return; @@ -2329,26 +1239,46 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) { struct damon_ctx *ctx = kdamond->damon_ctx; - struct damon_sysfs_schemes *sysfs_schemes; - struct damos *scheme; - int schemes_idx = 0; if (!ctx) return -EINVAL; - sysfs_schemes = kdamond->contexts->contexts_arr[0]->schemes; - damon_for_each_scheme(scheme, ctx) { - struct damon_sysfs_stats *sysfs_stats; - - sysfs_stats = sysfs_schemes->schemes_arr[schemes_idx++]->stats; - sysfs_stats->nr_tried = scheme->stat.nr_tried; - sysfs_stats->sz_tried = scheme->stat.sz_tried; - sysfs_stats->nr_applied = scheme->stat.nr_applied; - sysfs_stats->sz_applied = scheme->stat.sz_applied; - sysfs_stats->qt_exceeds = scheme->stat.qt_exceeds; - } + damon_sysfs_schemes_update_stats( + kdamond->contexts->contexts_arr[0]->schemes, ctx); return 0; } +static int damon_sysfs_upd_schemes_regions_start( + struct damon_sysfs_kdamond
*kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_start( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + +static int damon_sysfs_upd_schemes_regions_stop( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_update_regions_stop(ctx); +} + +static int damon_sysfs_clear_schemes_regions( + struct damon_sysfs_kdamond *kdamond) +{ + struct damon_ctx *ctx = kdamond->damon_ctx; + + if (!ctx) + return -EINVAL; + return damon_sysfs_schemes_clear_regions( + kdamond->contexts->contexts_arr[0]->schemes, ctx); +} + static inline bool damon_sysfs_kdamond_running( struct damon_sysfs_kdamond *kdamond) { @@ -2401,10 +1331,12 @@ static int damon_sysfs_commit_input(struct damon_sysfs_kdamond *kdamond) static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; + static bool damon_sysfs_schemes_regions_updating; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ - if (!mutex_trylock(&damon_sysfs_lock)) + if (!damon_sysfs_schemes_regions_updating && + !mutex_trylock(&damon_sysfs_lock)) return 0; kdamond = damon_sysfs_cmd_request.kdamond; if (!kdamond || kdamond->damon_ctx != c) @@ -2416,13 +1348,30 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: + if (!damon_sysfs_schemes_regions_updating) { + err = damon_sysfs_upd_schemes_regions_start(kdamond); + if (!err) { + damon_sysfs_schemes_regions_updating = true; + goto keep_lock_out; + } + } else { + err = damon_sysfs_upd_schemes_regions_stop(kdamond); + damon_sysfs_schemes_regions_updating = false; + } + break; + case DAMON_SYSFS_CMD_CLEAR_SCHEMES_TRIED_REGIONS: + err = damon_sysfs_clear_schemes_regions(kdamond); + break; default: break; } /* 
Mark the request as invalid now. */ damon_sysfs_cmd_request.kdamond = NULL; out: - mutex_unlock(&damon_sysfs_lock); + if (!damon_sysfs_schemes_regions_updating) + mutex_unlock(&damon_sysfs_lock); +keep_lock_out: return err; } diff --git a/mm/debug.c b/mm/debug.c index 0fd15ba70d16..7f8e5f744e42 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -94,9 +94,10 @@ static void __dump_page(struct page *page) page, page_ref_count(head), mapcount, mapping, page_to_pgoff(page), page_to_pfn(page)); if (compound) { - pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n", + pr_warn("head:%p order:%u compound_mapcount:%d subpages_mapcount:%d compound_pincount:%d\n", head, compound_order(head), - folio_entire_mapcount(folio), + head_compound_mapcount(head), + head_subpages_mapcount(head), head_compound_pincount(head)); } diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index dc7df1254f0a..c631ade3f1d2 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -38,11 +38,7 @@ * Please refer Documentation/mm/arch_pgtable_helpers.rst for the semantics * expectations that are being validated here. All future changes in here * or the documentation need to be in sync. - */ - -#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC) - -/* + * * On s390 platform, the lower 4 bits are used to identify given page table * entry type. But these bits might affect the ability to clear entries with * pxx_clear() because of how dynamic page table folding works on s390. 
So @@ -175,18 +171,6 @@ static void __init pte_advanced_tests(struct pgtable_debug_args *args) ptep_get_and_clear_full(args->mm, args->vaddr, args->ptep, 1); } -static void __init pte_savedwrite_tests(struct pgtable_debug_args *args) -{ - pte_t pte = pfn_pte(args->fixed_pte_pfn, args->page_prot_none); - - if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) - return; - - pr_debug("Validating PTE saved write\n"); - WARN_ON(!pte_savedwrite(pte_mk_savedwrite(pte_clear_savedwrite(pte)))); - WARN_ON(pte_savedwrite(pte_clear_savedwrite(pte_mk_savedwrite(pte)))); -} - #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void __init pmd_basic_tests(struct pgtable_debug_args *args, int idx) { @@ -306,22 +290,6 @@ static void __init pmd_leaf_tests(struct pgtable_debug_args *args) WARN_ON(!pmd_leaf(pmd)); } -static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) -{ - pmd_t pmd; - - if (!IS_ENABLED(CONFIG_NUMA_BALANCING)) - return; - - if (!has_transparent_hugepage()) - return; - - pr_debug("Validating PMD saved write\n"); - pmd = pfn_pmd(args->fixed_pmd_pfn, args->page_prot_none); - WARN_ON(!pmd_savedwrite(pmd_mk_savedwrite(pmd_clear_savedwrite(pmd)))); - WARN_ON(pmd_savedwrite(pmd_clear_savedwrite(pmd_mk_savedwrite(pmd)))); -} - #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) { @@ -455,7 +423,6 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args) { } static void __init pud_advanced_tests(struct pgtable_debug_args *args) { } static void __init pmd_leaf_tests(struct pgtable_debug_args *args) { } static void __init pud_leaf_tests(struct pgtable_debug_args *args) { } -static void __init pmd_savedwrite_tests(struct pgtable_debug_args *args) { } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP @@ -1125,7 +1092,7 @@ static int __init init_args(struct pgtable_debug_args *args) */ memset(args, 0, sizeof(*args)); args->vaddr = get_random_vaddr(); - args->page_prot = 
vm_get_page_prot(VMFLAGS); + args->page_prot = vm_get_page_prot(VM_ACCESS_FLAGS); args->page_prot_none = vm_get_page_prot(VM_NONE); args->is_contiguous_page = false; args->pud_pfn = ULONG_MAX; @@ -1292,9 +1259,6 @@ static int __init debug_vm_pgtable(void) pmd_leaf_tests(&args); pud_leaf_tests(&args); - pte_savedwrite_tests(&args); - pmd_savedwrite_tests(&args); - pte_special_tests(&args); pte_protnone_tests(&args); pmd_protnone_tests(&args); diff --git a/mm/fadvise.c b/mm/fadvise.c index c76ee665355a..bf04fec87f35 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -72,7 +72,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) */ endbyte = (u64)offset + (u64)len; if (!len || endbyte < len) - endbyte = -1; + endbyte = LLONG_MAX; else endbyte--; /* inclusive */ diff --git a/mm/failslab.c b/mm/failslab.c index 58df9789f1d2..ffc420c0e767 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -16,6 +16,8 @@ static struct { bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) { + int flags = 0; + /* No fault-injection for bootstrap cache */ if (unlikely(s == kmem_cache)) return false; @@ -30,10 +32,16 @@ bool __should_failslab(struct kmem_cache *s, gfp_t gfpflags) if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB)) return false; + /* + * In some cases, it expects to specify __GFP_NOWARN + * to avoid printing any information(not just a warning), + * thus avoiding deadlocks. See commit 6b9dbedbe349 for + * details. 
+ */ if (gfpflags & __GFP_NOWARN) - failslab.attr.no_warn = true; + flags |= FAULT_NOWARN; - return should_fail(&failslab.attr, s->object_size); + return should_fail_ex(&failslab.attr, s->object_size, flags); } static int __init setup_failslab(char *str) diff --git a/mm/filemap.c b/mm/filemap.c index 08341616ae7a..c4d4ace9cc70 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -506,9 +506,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping, struct pagevec pvec; int nr_pages; - if (end_byte < start_byte) - return; - pagevec_init(&pvec); while (index <= end) { unsigned i; @@ -670,6 +667,9 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0, err2; + if (lend < lstart) + return 0; + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); @@ -770,6 +770,9 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) int err = 0, err2; struct address_space *mapping = file->f_mapping; + if (lend < lstart) + return 0; + if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); @@ -785,56 +788,54 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend) EXPORT_SYMBOL(file_write_and_wait_range); /** - * replace_page_cache_page - replace a pagecache page with a new one - * @old: page to be replaced - * @new: page to replace with - * - * This function replaces a page in the pagecache with a new one. On - * success it acquires the pagecache reference for the new page and - * drops it for the old page. Both the old and new pages must be - * locked. This function does not add the new page to the LRU, the + * replace_page_cache_folio - replace a pagecache folio with a new one + * @old: folio to be replaced + * @new: folio to replace with + * + * This function replaces a folio in the pagecache with a new one. 
On + * success it acquires the pagecache reference for the new folio and + * drops it for the old folio. Both the old and new folios must be + * locked. This function does not add the new folio to the LRU, the * caller must do that. * * The remove + add is atomic. This function cannot fail. */ -void replace_page_cache_page(struct page *old, struct page *new) +void replace_page_cache_folio(struct folio *old, struct folio *new) { - struct folio *fold = page_folio(old); - struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); - VM_BUG_ON_PAGE(!PageLocked(old), old); - VM_BUG_ON_PAGE(!PageLocked(new), new); - VM_BUG_ON_PAGE(new->mapping, new); + VM_BUG_ON_FOLIO(!folio_test_locked(old), old); + VM_BUG_ON_FOLIO(!folio_test_locked(new), new); + VM_BUG_ON_FOLIO(new->mapping, new); - get_page(new); + folio_get(new); new->mapping = mapping; new->index = offset; - mem_cgroup_migrate(fold, fnew); + mem_cgroup_migrate(old, new); xas_lock_irq(&xas); xas_store(&xas, new); old->mapping = NULL; /* hugetlb pages do not participate in page cache accounting. 
*/ - if (!PageHuge(old)) - __dec_lruvec_page_state(old, NR_FILE_PAGES); - if (!PageHuge(new)) - __inc_lruvec_page_state(new, NR_FILE_PAGES); - if (PageSwapBacked(old)) - __dec_lruvec_page_state(old, NR_SHMEM); - if (PageSwapBacked(new)) - __inc_lruvec_page_state(new, NR_SHMEM); + if (!folio_test_hugetlb(old)) + __lruvec_stat_sub_folio(old, NR_FILE_PAGES); + if (!folio_test_hugetlb(new)) + __lruvec_stat_add_folio(new, NR_FILE_PAGES); + if (folio_test_swapbacked(old)) + __lruvec_stat_sub_folio(old, NR_SHMEM); + if (folio_test_swapbacked(new)) + __lruvec_stat_add_folio(new, NR_SHMEM); xas_unlock_irq(&xas); if (free_folio) - free_folio(fold); - folio_put(fold); + free_folio(old); + folio_put(old); } -EXPORT_SYMBOL_GPL(replace_page_cache_page); +EXPORT_SYMBOL_GPL(replace_page_cache_folio); noinline int __filemap_add_folio(struct address_space *mapping, struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp) @@ -2048,10 +2049,10 @@ reset: * * Return: The number of entries which were found. */ -unsigned find_get_entries(struct address_space *mapping, pgoff_t start, +unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { - XA_STATE(xas, &mapping->i_pages, start); + XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); @@ -2062,6 +2063,15 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, } rcu_read_unlock(); + if (folio_batch_count(fbatch)) { + unsigned long nr = 1; + int idx = folio_batch_count(fbatch) - 1; + + folio = fbatch->folios[idx]; + if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + nr = folio_nr_pages(folio); + *start = indices[idx] + nr; + } return folio_batch_count(fbatch); } @@ -2085,16 +2095,16 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t start, * * Return: The number of entries which were found. 
*/ -unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, +unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices) { - XA_STATE(xas, &mapping->i_pages, start); + XA_STATE(xas, &mapping->i_pages, *start); struct folio *folio; rcu_read_lock(); while ((folio = find_get_entry(&xas, end, XA_PRESENT))) { if (!xa_is_value(folio)) { - if (folio->index < start) + if (folio->index < *start) goto put; if (folio->index + folio_nr_pages(folio) - 1 > end) goto put; @@ -2117,6 +2127,15 @@ put: } rcu_read_unlock(); + if (folio_batch_count(fbatch)) { + unsigned long nr = 1; + int idx = folio_batch_count(fbatch) - 1; + + folio = fbatch->folios[idx]; + if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + nr = folio_nr_pages(folio); + *start = indices[idx] + nr; + } return folio_batch_count(fbatch); } diff --git a/mm/folio-compat.c b/mm/folio-compat.c index e1e23b4947d7..69ed25790c68 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -39,12 +39,6 @@ void wait_for_stable_page(struct page *page) } EXPORT_SYMBOL_GPL(wait_for_stable_page); -bool page_mapped(struct page *page) -{ - return folio_mapped(page_folio(page)); -} -EXPORT_SYMBOL(page_mapped); - void mark_page_accessed(struct page *page) { folio_mark_accessed(page_folio(page)); @@ -82,12 +76,6 @@ bool redirty_page_for_writepage(struct writeback_control *wbc, } EXPORT_SYMBOL(redirty_page_for_writepage); -void lru_cache_add(struct page *page) -{ - folio_add_lru(page_folio(page)); -} -EXPORT_SYMBOL(lru_cache_add); - void lru_cache_add_inactive_or_unevictable(struct page *page, struct vm_area_struct *vma) { @@ -108,7 +96,7 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, struct folio *folio; folio = __filemap_get_folio(mapping, index, fgp_flags, gfp); - if ((fgp_flags & FGP_HEAD) || !folio || xa_is_value(folio)) + if (!folio || xa_is_value(folio)) return &folio->page; return folio_file_page(folio, 
index); } @@ -124,17 +112,6 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, } EXPORT_SYMBOL(grab_cache_page_write_begin); -void delete_from_page_cache(struct page *page) -{ - return filemap_remove_folio(page_folio(page)); -} - -int try_to_release_page(struct page *page, gfp_t gfp) -{ - return filemap_release_folio(page_folio(page), gfp); -} -EXPORT_SYMBOL(try_to_release_page); - int isolate_lru_page(struct page *page) { if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page")) @@ -123,6 +123,9 @@ retry: */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags) { + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return NULL; + if (flags & FOLL_GET) return try_get_folio(page, refs); else if (flags & FOLL_PIN) { @@ -202,17 +205,22 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags) * time. Cases: please see the try_grab_folio() documentation, with * "refs=1". * - * Return: true for success, or if no action was required (if neither FOLL_PIN - * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or - * FOLL_PIN was set, but the page could not be grabbed. + * Return: 0 for success, or if no action was required (if neither FOLL_PIN + * nor FOLL_GET was set, nothing is done). A negative error code for failure: + * + * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not + * be grabbed. 
*/ -bool __must_check try_grab_page(struct page *page, unsigned int flags) +int __must_check try_grab_page(struct page *page, unsigned int flags) { struct folio *folio = page_folio(page); WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN)); if (WARN_ON_ONCE(folio_ref_count(folio) <= 0)) - return false; + return -ENOMEM; + + if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page))) + return -EREMOTEIO; if (flags & FOLL_GET) folio_ref_inc(folio); @@ -232,7 +240,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags) node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1); } - return true; + return 0; } /** @@ -537,42 +545,13 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) return ERR_PTR(-EINVAL); - - /* - * Considering PTE level hugetlb, like continuous-PTE hugetlb on - * ARM64 architecture. - */ - if (is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd_pte(vma, address, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - -retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). 
- */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto retry; - } + if (!pte_present(pte)) + goto no_page; if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto no_page; @@ -615,7 +594,7 @@ retry: } } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) { page = ERR_PTR(-EMLINK); goto out; } @@ -624,10 +603,12 @@ retry: !PageAnonExclusive(page), page); /* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */ - if (unlikely(!try_grab_page(page, flags))) { - page = ERR_PTR(-ENOMEM); + ret = try_grab_page(page, flags); + if (unlikely(ret)) { + page = ERR_PTR(ret); goto out; } + /* * We need to make the page accessible if and only if we are going * to access its content (the FOLL_PIN case). Please see @@ -680,42 +661,8 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, pmdval = READ_ONCE(*pmd); if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pmd_pte(vma, address, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pmd_val(pmdval)))) { - page = follow_huge_pd(vma, address, - __hugepd(pmd_val(pmdval)), flags, - PMD_SHIFT); - if (page) - return page; + if (!pmd_present(pmdval)) return no_page_table(vma, flags); - } -retry: - if (!pmd_present(pmdval)) { - /* - * Should never reach here, if thp migration is not supported; - * Otherwise, it must be a thp migration entry. 
- */ - VM_BUG_ON(!thp_migration_supported() || - !is_pmd_migration_entry(pmdval)); - - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - - pmd_migration_entry_wait(mm, pmd); - pmdval = READ_ONCE(*pmd); - /* - * MADV_DONTNEED may convert the pmd to null because - * mmap_lock is held in read mode - */ - if (pmd_none(pmdval)) - return no_page_table(vma, flags); - goto retry; - } if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap); @@ -729,18 +676,10 @@ retry: if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) return no_page_table(vma, flags); -retry_locked: ptl = pmd_lock(mm, pmd); - if (unlikely(pmd_none(*pmd))) { - spin_unlock(ptl); - return no_page_table(vma, flags); - } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); - if (likely(!(flags & FOLL_MIGRATION))) - return no_page_table(vma, flags); - pmd_migration_entry_wait(mm, pmd); - goto retry_locked; + return no_page_table(vma, flags); } if (unlikely(!pmd_trans_huge(*pmd))) { spin_unlock(ptl); @@ -783,20 +722,6 @@ static struct page *follow_pud_mask(struct vm_area_struct *vma, pud = pud_offset(p4dp, address); if (pud_none(*pud)) return no_page_table(vma, flags); - if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) { - page = follow_huge_pud(mm, address, pud, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pud_val(*pud)))) { - page = follow_huge_pd(vma, address, - __hugepd(pud_val(*pud)), flags, - PUD_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } if (pud_devmap(*pud)) { ptl = pud_lock(mm, pud); page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap); @@ -816,7 +741,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, struct follow_page_context *ctx) { p4d_t *p4d; - struct page *page; p4d = p4d_offset(pgdp, address); if (p4d_none(*p4d)) @@ -825,14 +749,6 @@ static struct page *follow_p4d_mask(struct vm_area_struct 
*vma, if (unlikely(p4d_bad(*p4d))) return no_page_table(vma, flags); - if (is_hugepd(__hugepd(p4d_val(*p4d)))) { - page = follow_huge_pd(vma, address, - __hugepd(p4d_val(*p4d)), flags, - P4D_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } return follow_pud_mask(vma, address, p4d, flags, ctx); } @@ -870,10 +786,18 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, ctx->page_mask = 0; - /* make this handle hugepd */ - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN)); + /* + * Call hugetlb_follow_page_mask for hugetlb vmas as it will use + * special hugetlb page table walking code. This eliminates the + * need to check for hugetlb entries in the general walking code. + * + * hugetlb_follow_page_mask is only for follow_page() handling here. + * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. + */ + if (is_vm_hugetlb_page(vma)) { + page = hugetlb_follow_page_mask(vma, address, flags); + if (!page) + page = no_page_table(vma, flags); return page; } @@ -882,21 +806,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) return no_page_table(vma, flags); - if (pgd_huge(*pgd)) { - page = follow_huge_pgd(mm, address, pgd, flags); - if (page) - return page; - return no_page_table(vma, flags); - } - if (is_hugepd(__hugepd(pgd_val(*pgd)))) { - page = follow_huge_pd(vma, address, - __hugepd(pgd_val(*pgd)), flags, - PGDIR_SHIFT); - if (page) - return page; - return no_page_table(vma, flags); - } - return follow_p4d_mask(vma, address, pgd, flags, ctx); } @@ -960,10 +869,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address, goto unmap; *page = pte_page(*pte); } - if (unlikely(!try_grab_page(*page, gup_flags))) { - ret = -ENOMEM; + ret = try_grab_page(*page, gup_flags); + if (unlikely(ret)) goto unmap; - } out: ret = 0; unmap: @@ -989,8 +897,17 @@ static int 
faultin_page(struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; if (*flags & FOLL_REMOTE) fault_flags |= FAULT_FLAG_REMOTE; - if (locked) + if (locked) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + /* + * FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set + * FOLL_INTERRUPTIBLE to enable FAULT_FLAG_INTERRUPTIBLE. + * That's because some callers may not be prepared to + * handle early exits caused by non-fatal signals. + */ + if (*flags & FOLL_INTERRUPTIBLE) + fault_flags |= FAULT_FLAG_INTERRUPTIBLE; + } if (*flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; if (*flags & FOLL_TRIED) { @@ -1058,6 +975,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; + if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA)) + return -EOPNOTSUPP; + if (vma_is_secretmem(vma)) return -EFAULT; @@ -1065,6 +985,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) return -EFAULT; + /* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */ + if (is_vm_hugetlb_page(vma)) + return -EFAULT; /* * We used to let the write,force case do COW in a * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could @@ -1392,6 +1315,22 @@ retry: EXPORT_SYMBOL_GPL(fixup_user_fault); /* + * GUP always responds to fatal signals. When FOLL_INTERRUPTIBLE is + * specified, it'll also respond to generic signals. The caller of GUP + * that has FOLL_INTERRUPTIBLE should take care of the GUP interruption. 
+ */ +static bool gup_signal_pending(unsigned int flags) +{ + if (fatal_signal_pending(current)) + return true; + + if (!(flags & FOLL_INTERRUPTIBLE)) + return false; + + return signal_pending(current); +} + +/* * Please note that this function, unlike __get_user_pages will not * return 0 for nr_pages > 0 without FOLL_NOWAIT */ @@ -1472,11 +1411,11 @@ retry: * Repeat on the address that fired VM_FAULT_RETRY * with both FAULT_FLAG_ALLOW_RETRY and * FAULT_FLAG_TRIED. Note that GUP can be interrupted - * by fatal signals, so we need to check it before we + * by fatal signals or even common signals, depending on + * the caller's request. So we need to check it before we * start trying again otherwise it can loop forever. */ - - if (fatal_signal_pending(current)) { + if (gup_signal_pending(flags)) { if (!pages_done) pages_done = -EINTR; break; @@ -2105,14 +2044,19 @@ static long __gup_longterm_locked(struct mm_struct *mm, unsigned long nr_pages, struct page **pages, struct vm_area_struct **vmas, + int *locked, unsigned int gup_flags) { + bool must_unlock = false; unsigned int flags; long rc, nr_pinned_pages; + if (locked && WARN_ON_ONCE(!*locked)) + return -EINVAL; + if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - NULL, gup_flags); + locked, gup_flags); /* * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM @@ -2126,8 +2070,13 @@ static long __gup_longterm_locked(struct mm_struct *mm, return -EINVAL; flags = memalloc_pin_save(); do { + if (locked && !*locked) { + mmap_read_lock(mm); + must_unlock = true; + *locked = 1; + } nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, - pages, vmas, NULL, + pages, vmas, locked, gup_flags); if (nr_pinned_pages <= 0) { rc = nr_pinned_pages; @@ -2137,6 +2086,10 @@ static long __gup_longterm_locked(struct mm_struct *mm, } while (rc == -EAGAIN); memalloc_pin_restore(flags); + if (locked && *locked && must_unlock) { + mmap_read_unlock(mm); + *locked = 0; 
+ } return rc ? rc : nr_pinned_pages; } @@ -2160,35 +2113,6 @@ static bool is_valid_gup_flags(unsigned int gup_flags) } #ifdef CONFIG_MMU -static long __get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - /* - * Parts of FOLL_LONGTERM behavior are incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. However, this only comes up if locked is set, and there are - * callers that do request FOLL_LONGTERM, but do not set locked. So, - * allow what we can. - */ - if (gup_flags & FOLL_LONGTERM) { - if (WARN_ON_ONCE(locked)) - return -EINVAL; - /* - * This will check the vmas (even if our vmas arg is NULL) - * and return -ENOTSUPP if DAX isn't allowed in this case: - */ - return __gup_longterm_locked(mm, start, nr_pages, pages, - vmas, gup_flags | FOLL_TOUCH | - FOLL_REMOTE); - } - - return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, - locked, - gup_flags | FOLL_TOUCH | FOLL_REMOTE); -} - /** * get_user_pages_remote() - pin user pages in memory * @mm: mm_struct of target mm @@ -2257,8 +2181,8 @@ long get_user_pages_remote(struct mm_struct *mm, if (!is_valid_gup_flags(gup_flags)) return -EINVAL; - return __get_user_pages_remote(mm, start, nr_pages, gup_flags, - pages, vmas, locked); + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -2270,14 +2194,6 @@ long get_user_pages_remote(struct mm_struct *mm, { return 0; } - -static long __get_user_pages_remote(struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *locked) -{ - return 0; -} #endif /* !CONFIG_MMU */ /** @@ -2304,7 +2220,7 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, return -EINVAL; return 
__gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, gup_flags | FOLL_TOUCH); + pages, vmas, NULL, gup_flags | FOLL_TOUCH); } EXPORT_SYMBOL(get_user_pages); @@ -2330,18 +2246,9 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int locked = 1; long ret; - /* - * FIXME: Current FOLL_LONGTERM behavior is incompatible with - * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on - * vmas. As there are no users of this flag in this call we simply - * disallow this option for now. - */ - if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) - return -EINVAL; - mmap_read_lock(mm); - ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL, - &locked, gup_flags | FOLL_TOUCH); + ret = __gup_longterm_locked(mm, start, nr_pages, pages, NULL, &locked, + gup_flags | FOLL_TOUCH); if (locked) mmap_read_unlock(mm); return ret; @@ -2468,7 +2375,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, goto pte_unmap; } - if (!pte_write(pte) && gup_must_unshare(flags, page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, page)) { gup_put_folio(folio, 1, flags); goto pte_unmap; } @@ -2534,9 +2441,15 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, undo_dev_pagemap(nr, nr_start, flags, pages); break; } + + if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) { + undo_dev_pagemap(nr, nr_start, flags, pages); + break; + } + SetPageReferenced(page); pages[*nr] = page; - if (unlikely(!try_grab_page(page, flags))) { + if (unlikely(try_grab_page(page, flags))) { undo_dev_pagemap(nr, nr_start, flags, pages); break; } @@ -2654,7 +2567,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } - if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) { + if (!pte_write(pte) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2720,7 +2633,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long 
addr, return 0; } - if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pmd_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2760,7 +2673,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) { + if (!pud_write(orig) && gup_must_unshare(NULL, flags, &folio->page)) { gup_put_folio(folio, refs, flags); return 0; } @@ -2808,7 +2721,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo pmdp = pmd_offset_lockless(pudp, pud, addr); do { - pmd_t pmd = READ_ONCE(*pmdp); + pmd_t pmd = pmdp_get_lockless(pmdp); next = pmd_addr_end(addr, end); if (!pmd_present(pmd)) @@ -2852,7 +2765,7 @@ static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned lo next = pud_addr_end(addr, end); if (unlikely(!pud_present(pud))) return 0; - if (unlikely(pud_huge(pud))) { + if (unlikely(pud_huge(pud) || pud_devmap(pud))) { if (!gup_huge_pud(pud, pudp, addr, next, flags, pages, nr)) return 0; @@ -2935,29 +2848,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end) } #endif -static int __gup_longterm_unlocked(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) -{ - int ret; - - /* - * FIXME: FOLL_LONGTERM does not work with - * get_user_pages_unlocked() (see comments in that function) - */ - if (gup_flags & FOLL_LONGTERM) { - mmap_read_lock(current->mm); - ret = __gup_longterm_locked(current->mm, - start, nr_pages, - pages, NULL, gup_flags); - mmap_read_unlock(current->mm); - } else { - ret = get_user_pages_unlocked(start, nr_pages, - pages, gup_flags); - } - - return ret; -} - static unsigned long lockless_pages_from_mm(unsigned long start, unsigned long end, unsigned int gup_flags, @@ -3018,7 +2908,8 @@ static int internal_get_user_pages_fast(unsigned long start, if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | 
FOLL_LONGTERM | FOLL_FORCE | FOLL_PIN | FOLL_GET | - FOLL_FAST_ONLY | FOLL_NOFAULT))) + FOLL_FAST_ONLY | FOLL_NOFAULT | + FOLL_PCI_P2PDMA))) return -EINVAL; if (gup_flags & FOLL_PIN) @@ -3041,8 +2932,8 @@ static int internal_get_user_pages_fast(unsigned long start, /* Slow path: try to get the remaining pages with get_user_pages */ start += nr_pinned << PAGE_SHIFT; pages += nr_pinned; - ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags, - pages); + ret = get_user_pages_unlocked(start, nr_pages - nr_pinned, pages, + gup_flags); if (ret < 0) { /* * The caller has to unpin the pages we already pinned so @@ -3241,9 +3132,9 @@ long pin_user_pages_remote(struct mm_struct *mm, if (WARN_ON_ONCE(!pages)) return -EINVAL; - gup_flags |= FOLL_PIN; - return __get_user_pages_remote(mm, start, nr_pages, gup_flags, - pages, vmas, locked); + return __gup_longterm_locked(mm, start, nr_pages, pages, vmas, locked, + gup_flags | FOLL_PIN | FOLL_TOUCH | + FOLL_REMOTE); } EXPORT_SYMBOL(pin_user_pages_remote); @@ -3277,7 +3168,7 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, gup_flags |= FOLL_PIN; return __gup_longterm_locked(current->mm, start, nr_pages, - pages, vmas, gup_flags); + pages, vmas, NULL, gup_flags); } EXPORT_SYMBOL(pin_user_pages); diff --git a/mm/gup_test.c b/mm/gup_test.c index 12b0a91767d3..8ae7307a1bb6 100644 --- a/mm/gup_test.c +++ b/mm/gup_test.c @@ -4,6 +4,7 @@ #include <linux/uaccess.h> #include <linux/ktime.h> #include <linux/debugfs.h> +#include <linux/highmem.h> #include "gup_test.h" static void put_back_pages(unsigned int cmd, struct page **pages, @@ -203,6 +204,138 @@ free_pages: return ret; } +static DEFINE_MUTEX(pin_longterm_test_mutex); +static struct page **pin_longterm_test_pages; +static unsigned long pin_longterm_test_nr_pages; + +static inline void pin_longterm_test_stop(void) +{ + if (pin_longterm_test_pages) { + if (pin_longterm_test_nr_pages) + unpin_user_pages(pin_longterm_test_pages, + 
pin_longterm_test_nr_pages); + kvfree(pin_longterm_test_pages); + pin_longterm_test_pages = NULL; + pin_longterm_test_nr_pages = 0; + } +} + +static inline int pin_longterm_test_start(unsigned long arg) +{ + long nr_pages, cur_pages, addr, remaining_pages; + int gup_flags = FOLL_LONGTERM; + struct pin_longterm_test args; + struct page **pages; + int ret = 0; + bool fast; + + if (pin_longterm_test_pages) + return -EINVAL; + + if (copy_from_user(&args, (void __user *)arg, sizeof(args))) + return -EFAULT; + + if (args.flags & + ~(PIN_LONGTERM_TEST_FLAG_USE_WRITE|PIN_LONGTERM_TEST_FLAG_USE_FAST)) + return -EINVAL; + if (!IS_ALIGNED(args.addr | args.size, PAGE_SIZE)) + return -EINVAL; + if (args.size > LONG_MAX) + return -EINVAL; + nr_pages = args.size / PAGE_SIZE; + if (!nr_pages) + return -EINVAL; + + pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + if (args.flags & PIN_LONGTERM_TEST_FLAG_USE_WRITE) + gup_flags |= FOLL_WRITE; + fast = !!(args.flags & PIN_LONGTERM_TEST_FLAG_USE_FAST); + + if (!fast && mmap_read_lock_killable(current->mm)) { + kvfree(pages); + return -EINTR; + } + + pin_longterm_test_pages = pages; + pin_longterm_test_nr_pages = 0; + + while (nr_pages - pin_longterm_test_nr_pages) { + remaining_pages = nr_pages - pin_longterm_test_nr_pages; + addr = args.addr + pin_longterm_test_nr_pages * PAGE_SIZE; + + if (fast) + cur_pages = pin_user_pages_fast(addr, remaining_pages, + gup_flags, pages); + else + cur_pages = pin_user_pages(addr, remaining_pages, + gup_flags, pages, NULL); + if (cur_pages < 0) { + pin_longterm_test_stop(); + ret = cur_pages; + break; + } + pin_longterm_test_nr_pages += cur_pages; + pages += cur_pages; + } + + if (!fast) + mmap_read_unlock(current->mm); + return ret; +} + +static inline int pin_longterm_test_read(unsigned long arg) +{ + __u64 user_addr; + unsigned long i; + + if (!pin_longterm_test_pages) + return -EINVAL; + + if (copy_from_user(&user_addr, (void __user *)arg, 
sizeof(user_addr))) + return -EFAULT; + + for (i = 0; i < pin_longterm_test_nr_pages; i++) { + void *addr = kmap_local_page(pin_longterm_test_pages[i]); + unsigned long ret; + + ret = copy_to_user((void __user *)(unsigned long)user_addr, addr, + PAGE_SIZE); + kunmap_local(addr); + if (ret) + return -EFAULT; + user_addr += PAGE_SIZE; + } + return 0; +} + +static long pin_longterm_test_ioctl(struct file *filep, unsigned int cmd, + unsigned long arg) +{ + int ret = -EINVAL; + + if (mutex_lock_killable(&pin_longterm_test_mutex)) + return -EINTR; + + switch (cmd) { + case PIN_LONGTERM_TEST_START: + ret = pin_longterm_test_start(arg); + break; + case PIN_LONGTERM_TEST_STOP: + pin_longterm_test_stop(); + ret = 0; + break; + case PIN_LONGTERM_TEST_READ: + ret = pin_longterm_test_read(arg); + break; + } + + mutex_unlock(&pin_longterm_test_mutex); + return ret; +} + static long gup_test_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) { @@ -217,6 +350,10 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd, case PIN_BASIC_TEST: case DUMP_USER_PAGES_TEST: break; + case PIN_LONGTERM_TEST_START: + case PIN_LONGTERM_TEST_STOP: + case PIN_LONGTERM_TEST_READ: + return pin_longterm_test_ioctl(filep, cmd, arg); default: return -EINVAL; } @@ -234,9 +371,17 @@ static long gup_test_ioctl(struct file *filep, unsigned int cmd, return 0; } +static int gup_test_release(struct inode *inode, struct file *file) +{ + pin_longterm_test_stop(); + + return 0; +} + static const struct file_operations gup_test_fops = { .open = nonseekable_open, .unlocked_ioctl = gup_test_ioctl, + .release = gup_test_release, }; static int __init gup_test_init(void) diff --git a/mm/gup_test.h b/mm/gup_test.h index 887ac1d5f5bc..5b37b54e8bea 100644 --- a/mm/gup_test.h +++ b/mm/gup_test.h @@ -10,6 +10,9 @@ #define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test) #define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test) #define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test) +#define 
PIN_LONGTERM_TEST_START _IOW('g', 7, struct pin_longterm_test) +#define PIN_LONGTERM_TEST_STOP _IO('g', 8) +#define PIN_LONGTERM_TEST_READ _IOW('g', 9, __u64) #define GUP_TEST_MAX_PAGES_TO_DUMP 8 @@ -30,4 +33,13 @@ struct gup_test { __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP]; }; +#define PIN_LONGTERM_TEST_FLAG_USE_WRITE 1 +#define PIN_LONGTERM_TEST_FLAG_USE_FAST 2 + +struct pin_longterm_test { + __u64 addr; + __u64 size; + __u32 flags; +}; + #endif /* __GUP_TEST_H */ @@ -361,8 +361,7 @@ again: * huge or device mapping one and compute corresponding pfn * values. */ - pmd = pmd_read_atomic(pmdp); - barrier(); + pmd = pmdp_get_lockless(pmdp); if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) goto again; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1cc4a5f4791e..abe6cfd92ffa 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1035,6 +1035,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pmd_pfn(*pmd); struct mm_struct *mm = vma->vm_mm; struct page *page; + int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -1066,8 +1067,9 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - if (!try_grab_page(page, flags)) - page = ERR_PTR(-ENOMEM); + ret = try_grab_page(page, flags); + if (ret) + page = ERR_PTR(ret); return page; } @@ -1193,6 +1195,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pud_pfn(*pud); struct mm_struct *mm = vma->vm_mm; struct page *page; + int ret; assert_spin_locked(pud_lockptr(mm, pud)); @@ -1226,8 +1229,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, if (!*pgmap) return ERR_PTR(-EFAULT); page = pfn_to_page(pfn); - if (!try_grab_page(page, flags)) - page = ERR_PTR(-ENOMEM); + + ret = try_grab_page(page, flags); + if (ret) + page = ERR_PTR(ret); return page; } @@ -1313,9 +1318,6 @@ vm_fault_t 
do_huge_pmd_wp_page(struct vm_fault *vmf) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); - if (is_huge_zero_pmd(orig_pmd)) goto fallback; @@ -1379,7 +1381,7 @@ reuse: if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); - return VM_FAULT_WRITE; + return 0; } unlock_fallback: @@ -1390,6 +1392,36 @@ fallback: return VM_FAULT_FALLBACK; } +static inline bool can_change_pmd_writable(struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + struct page *page; + + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) + return false; + + /* Don't touch entries that are not even readable (NUMA hinting). */ + if (pmd_protnone(pmd)) + return false; + + /* Do we need write faults for softdirty tracking? */ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + + /* Do we need write faults for uffd-wp tracking? */ + if (userfaultfd_huge_pmd_wp(vma, pmd)) + return false; + + if (!(vma->vm_flags & VM_SHARED)) { + /* See can_change_pte_writable(). */ + page = vm_normal_page_pmd(vma, addr, pmd); + return page && PageAnon(page) && PageAnonExclusive(page); + } + + /* See can_change_pte_writable(). */ + return pmd_dirty(pmd); +} + /* FOLL_FORCE can write to even unwritable PMDs in COW mappings. 
*/ static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, struct vm_area_struct *vma, @@ -1435,6 +1467,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; struct page *page; + int ret; assert_spin_locked(pmd_lockptr(mm, pmd)); @@ -1453,14 +1486,15 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) return NULL; - if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) + if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page)) return ERR_PTR(-EMLINK); VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && !PageAnonExclusive(page), page); - if (!try_grab_page(page, flags)) - return ERR_PTR(-ENOMEM); + ret = try_grab_page(page, flags); + if (ret) + return ERR_PTR(ret); if (flags & FOLL_TOUCH) touch_pmd(vma, addr, pmd, flags & FOLL_WRITE); @@ -1481,8 +1515,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = NUMA_NO_NODE; int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK); - bool migrated = false; - bool was_writable = pmd_savedwrite(oldpmd); + bool migrated = false, writable = false; int flags = 0; vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); @@ -1492,12 +1525,22 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } pmd = pmd_modify(oldpmd, vma->vm_page_prot); + + /* + * Detect now whether the PMD could be writable; this information + * is only valid while holding the PT lock. 
+ */ + writable = pmd_write(pmd); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pmd_writable(vma, vmf->address, pmd)) + writable = true; + page = vm_normal_page_pmd(vma, haddr, pmd); if (!page) goto out_map; /* See similar comment in do_numa_page for explanation */ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; page_nid = page_to_nid(page); @@ -1516,6 +1559,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) } spin_unlock(vmf->ptl); + writable = false; migrated = migrate_misplaced_page(page, vma, target_nid); if (migrated) { @@ -1542,7 +1586,7 @@ out_map: /* Restore the PMD */ pmd = pmd_modify(oldpmd, vma->vm_page_prot); pmd = pmd_mkyoung(pmd); - if (was_writable) + if (writable) pmd = pmd_mkwrite(pmd); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); @@ -1783,11 +1827,10 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; pmd_t oldpmd, entry; - bool preserve_write; - int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + int ret = 1; tlb_change_page_size(tlb, HPAGE_PMD_SIZE); @@ -1798,9 +1841,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - preserve_write = prot_numa && pmd_write(*pmd); - ret = 1; - #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION if (is_swap_pmd(*pmd)) { swp_entry_t entry = pmd_to_swp_entry(*pmd); @@ -1880,8 +1920,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, oldpmd = pmdp_invalidate_ad(vma, addr, pmd); entry = pmd_modify(oldpmd, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); if (uffd_wp) { entry = pmd_wrprotect(entry); entry = pmd_mkuffd_wp(entry); @@ -1893,13 +1931,17 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, */ entry = pmd_clear_uffd_wp(entry); } + + 
/* See change_pte_range(). */ + if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) && !pmd_write(entry) && + can_change_pmd_writable(vma, addr, entry)) + entry = pmd_mkwrite(entry); + ret = HPAGE_PMD_NR; set_pmd_at(mm, addr, pmd, entry); if (huge_pmd_needs_flush(oldpmd, entry)) tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE); - - BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); unlock: spin_unlock(ptl); return ret; @@ -2141,7 +2183,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, uffd_wp = pmd_uffd_wp(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); - page_ref_add(page, HPAGE_PMD_NR - 1); /* * Without "freeze", we'll simply split the PMD, propagating the @@ -2161,6 +2202,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, anon_exclusive = PageAnon(page) && PageAnonExclusive(page); if (freeze && anon_exclusive && page_try_share_anon_rmap(page)) freeze = false; + if (!freeze) + page_ref_add(page, HPAGE_PMD_NR - 1); } /* @@ -2202,63 +2245,37 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = maybe_mkwrite(entry, vma); if (anon_exclusive) SetPageAnonExclusive(page + i); - if (!write) - entry = pte_wrprotect(entry); if (!young) entry = pte_mkold(entry); /* NOTE: this may set soft-dirty too on some archs */ if (dirty) entry = pte_mkdirty(entry); + /* + * NOTE: this needs to happen after pte_mkdirty, + * because some archs (sparc64, loongarch) could + * set hw write bit when mkdirty. 
+ */ + if (!write) + entry = pte_wrprotect(entry); if (soft_dirty) entry = pte_mksoft_dirty(entry); if (uffd_wp) entry = pte_mkuffd_wp(entry); + page_add_anon_rmap(page + i, vma, addr, false); } pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); - if (!pmd_migration) - atomic_inc(&page[i]._mapcount); pte_unmap(pte); } - if (!pmd_migration) { - /* - * Set PG_double_map before dropping compound_mapcount to avoid - * false-negative page_mapped(). - */ - if (compound_mapcount(page) > 1 && - !TestSetPageDoubleMap(page)) { - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_inc(&page[i]._mapcount); - } - - lock_page_memcg(page); - if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { - /* Last compound_mapcount is gone. */ - __mod_lruvec_page_state(page, NR_ANON_THPS, - -HPAGE_PMD_NR); - if (TestClearPageDoubleMap(page)) { - /* No need in mapcount reference anymore */ - for (i = 0; i < HPAGE_PMD_NR; i++) - atomic_dec(&page[i]._mapcount); - } - } - unlock_page_memcg(page); - - /* Above is effectively page_remove_rmap(page, vma, true) */ - munlock_vma_page(page, vma, true); - } + if (!pmd_migration) + page_remove_rmap(page, vma, true); + if (freeze) + put_page(page); smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - - if (freeze) { - for (i = 0; i < HPAGE_PMD_NR; i++) { - page_remove_rmap(page + i, vma, false); - put_page(page + i); - } - } } void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, @@ -2444,18 +2461,32 @@ static void __split_huge_page_tail(struct page *head, int tail, (1L << PG_workingset) | (1L << PG_locked) | (1L << PG_unevictable) | -#ifdef CONFIG_64BIT +#ifdef CONFIG_ARCH_USES_PG_ARCH_X (1L << PG_arch_2) | + (1L << PG_arch_3) | #endif (1L << PG_dirty) | LRU_GEN_MASK | LRU_REFS_MASK)); - /* ->mapping in first tail page is compound_mapcount */ + /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, 
page_tail); page_tail->mapping = head->mapping; page_tail->index = head->index + tail; - page_tail->private = 0; + + /* + * page->private should not be set in tail pages with the exception + * of swap cache pages that store the swp_entry_t in tail pages. + * Fix up and warn once if private is unexpectedly set. + * + * What of 32-bit systems, on which head[1].compound_pincount overlays + * head[1].private? No problem: THP_SWAP is not enabled on 32-bit, and + * compound_pincount must be 0 for folio_ref_freeze() to have succeeded. + */ + if (!folio_test_swapcache(page_folio(head))) { + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); + page_tail->private = 0; + } /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); @@ -2703,7 +2734,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { - ret = -EBUSY; + ret = -EAGAIN; goto out_unlock; } @@ -2753,7 +2784,7 @@ fail: xas_unlock(&xas); local_irq_enable(); remap_page(folio, folio_nr_pages(folio)); - ret = -EBUSY; + ret = -EAGAIN; } out_unlock: @@ -3057,28 +3088,28 @@ static int split_huge_pages_in_file(const char *file_path, pgoff_t off_start, mapping = candidate->f_mapping; for (index = off_start; index < off_end; index += nr_pages) { - struct page *fpage = pagecache_get_page(mapping, index, - FGP_ENTRY | FGP_HEAD, 0); + struct folio *folio = __filemap_get_folio(mapping, index, + FGP_ENTRY, 0); nr_pages = 1; - if (xa_is_value(fpage) || !fpage) + if (xa_is_value(folio) || !folio) continue; - if (!is_transparent_hugepage(fpage)) + if (!folio_test_large(folio)) goto next; total++; - nr_pages = thp_nr_pages(fpage); + nr_pages = folio_nr_pages(folio); - if (!trylock_page(fpage)) + if (!folio_trylock(folio)) goto next; - if (!split_huge_page(fpage)) + if (!split_folio(folio)) split++; - unlock_page(fpage); + folio_unlock(folio); next: - put_page(fpage); + folio_put(folio); cond_resched(); } diff --git 
a/mm/hugetlb.c b/mm/hugetlb.c index b586cdd75930..db895230ee7e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -54,13 +54,13 @@ struct hstate hstates[HUGE_MAX_HSTATE]; #ifdef CONFIG_CMA static struct cma *hugetlb_cma[MAX_NUMNODES]; static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata; -static bool hugetlb_cma_page(struct page *page, unsigned int order) +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { - return cma_pages_valid(hugetlb_cma[page_to_nid(page)], page, + return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page, 1 << order); } #else -static bool hugetlb_cma_page(struct page *page, unsigned int order) +static bool hugetlb_cma_folio(struct folio *folio, unsigned int order) { return false; } @@ -255,6 +255,152 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) return subpool_inode(file_inode(vma->vm_file)); } +/* + * hugetlb vma_lock helper routines + */ +static bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && + vma->vm_private_data; +} + +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_write(&vma_lock->rw_sema); + } +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock 
= vma->vm_private_data; + + if (!__vma_shareable_lock(vma)) + return 1; + + return down_write_trylock(&vma_lock->rw_sema); +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + lockdep_assert_held(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_release(struct kref *kref) +{ + struct hugetlb_vma_lock *vma_lock = container_of(kref, + struct hugetlb_vma_lock, refs); + + kfree(vma_lock); +} + +static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +{ + struct vm_area_struct *vma = vma_lock->vma; + + /* + * vma_lock structure may or may not be released as a result of put, + * it certainly will no longer be attached to vma so clear pointer. + * Semaphore synchronizes access to vma_lock->vma field. + */ + vma_lock->vma = NULL; + vma->vm_private_data = NULL; + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); +} + +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) +{ + if (__vma_shareable_lock(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ + /* + * Only present in sharable vmas. 
+ */ + if (!vma || !__vma_shareable_lock(vma)) + return; + + if (vma->vm_private_data) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock; + + /* Only establish in (flags) sharable vmas */ + if (!vma || !(vma->vm_flags & VM_MAYSHARE)) + return; + + /* Should never get here with non-NULL vm_private_data */ + if (vma->vm_private_data) + return; + + vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); + if (!vma_lock) { + /* + * If we can not allocate structure, then vma can not + * participate in pmd sharing. This is only a possible + * performance enhancement and memory saving issue. + * However, the lock is also used to synchronize page + * faults with truncation. If the lock is not present, + * unlikely races could leave pages in a file past i_size + * until the file is removed. Warn in the unlikely case of + * allocation failure. + */ + pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); + return; + } + + kref_init(&vma_lock->refs); + init_rwsem(&vma_lock->rw_sema); + vma_lock->vma = vma; + vma->vm_private_data = vma_lock; +} + /* Helper that removes a struct file_region from the resv_map cache and returns * it for use. */ @@ -1014,15 +1160,23 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma) VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); /* * Clear vm_private_data + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + * Before clearing, make sure pointer is not associated with vma + * as this will leak the structure. This is the case when called + * via clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already + * been called to allocate a new structure. * - For MAP_PRIVATE mappings, this is the reserve map which does * not apply to children. 
Faults generated by the children are * not guaranteed to succeed, even if read-only. - * - For shared mappings this is a per-vma semaphore that may be - * allocated in a subsequent call to hugetlb_vm_op_open. */ - vma->vm_private_data = (void *)0; - if (!(vma->vm_flags & VM_MAYSHARE)) - return; + if (vma->vm_flags & VM_MAYSHARE) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock && vma_lock->vma != vma) + vma->vm_private_data = NULL; + } else + vma->vm_private_data = NULL; } /* @@ -1119,17 +1273,17 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) return false; } -static void enqueue_huge_page(struct hstate *h, struct page *page) +static void enqueue_hugetlb_folio(struct hstate *h, struct folio *folio) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); lockdep_assert_held(&hugetlb_lock); - VM_BUG_ON_PAGE(page_count(page), page); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); - list_move(&page->lru, &h->hugepage_freelists[nid]); + list_move(&folio->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; - SetHPageFreed(page); + folio_set_hugetlb_freed(folio); } static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid) @@ -1317,76 +1471,76 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) nr_nodes--) /* used to demote non-gigantic_huge pages as well */ -static void __destroy_compound_gigantic_page(struct page *page, +static void __destroy_compound_gigantic_folio(struct folio *folio, unsigned int order, bool demote) { int i; int nr_pages = 1 << order; struct page *p; - atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(folio_mapcount_ptr(folio), 0); + atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(folio_pincount_ptr(folio), 0); for (i = 1; i < nr_pages; i++) { - p = nth_page(page, i); + p = folio_page(folio, i); p->mapping = NULL; clear_compound_head(p); 
if (!demote) set_page_refcounted(p); } - set_compound_order(page, 0); -#ifdef CONFIG_64BIT - page[1].compound_nr = 0; -#endif - __ClearPageHead(page); + folio_set_compound_order(folio, 0); + __folio_clear_head(folio); } -static void destroy_compound_hugetlb_page_for_demote(struct page *page, +static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio, unsigned int order) { - __destroy_compound_gigantic_page(page, order, true); + __destroy_compound_gigantic_folio(folio, order, true); } #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE -static void destroy_compound_gigantic_page(struct page *page, +static void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { - __destroy_compound_gigantic_page(page, order, false); + __destroy_compound_gigantic_folio(folio, order, false); } -static void free_gigantic_page(struct page *page, unsigned int order) +static void free_gigantic_folio(struct folio *folio, unsigned int order) { /* * If the page isn't allocated using the cma allocator, * cma_release() returns false. 
*/ #ifdef CONFIG_CMA - if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order)) + int nid = folio_nid(folio); + + if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order)) return; #endif - free_contig_range(page_to_pfn(page), 1 << order); + free_contig_range(folio_pfn(folio), 1 << order); } #ifdef CONFIG_CONTIG_ALLOC -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { + struct page *page; unsigned long nr_pages = pages_per_huge_page(h); if (nid == NUMA_NO_NODE) nid = numa_mem_id(); #ifdef CONFIG_CMA { - struct page *page; int node; if (hugetlb_cma[nid]) { page = cma_alloc(hugetlb_cma[nid], nr_pages, huge_page_order(h), true); if (page) - return page; + return page_folio(page); } if (!(gfp_mask & __GFP_THISNODE)) { @@ -1397,17 +1551,18 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, page = cma_alloc(hugetlb_cma[node], nr_pages, huge_page_order(h), true); if (page) - return page; + return page_folio(page); } } } #endif - return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); + page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask); + return page ? 
page_folio(page) : NULL; } #else /* !CONFIG_CONTIG_ALLOC */ -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; @@ -1415,40 +1570,41 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, #endif /* CONFIG_CONTIG_ALLOC */ #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ -static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, +static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nodemask) { return NULL; } -static inline void free_gigantic_page(struct page *page, unsigned int order) { } -static inline void destroy_compound_gigantic_page(struct page *page, +static inline void free_gigantic_folio(struct folio *folio, + unsigned int order) { } +static inline void destroy_compound_gigantic_folio(struct folio *folio, unsigned int order) { } #endif /* - * Remove hugetlb page from lists, and update dtor so that page appears + * Remove hugetlb folio from lists, and update dtor so that the folio appears * as just a compound page. * - * A reference is held on the page, except in the case of demote. + * A reference is held on the folio, except in the case of demote. * * Must be called with hugetlb lock held. 
*/ -static void __remove_hugetlb_page(struct hstate *h, struct page *page, +static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus, bool demote) { - int nid = page_to_nid(page); + int nid = folio_nid(folio); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); - VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio(folio), folio); + VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio); lockdep_assert_held(&hugetlb_lock); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; - list_del(&page->lru); + list_del(&folio->lru); - if (HPageFreed(page)) { + if (folio_test_hugetlb_freed(folio)) { h->free_huge_pages--; h->free_huge_pages_node[nid]--; } @@ -1467,50 +1623,50 @@ static void __remove_hugetlb_page(struct hstate *h, struct page *page, * * For gigantic pages set the destructor to the null dtor. This * destructor will never be called. Before freeing the gigantic - * page destroy_compound_gigantic_page will turn the compound page - * into a simple group of pages. After this the destructor does not + * page destroy_compound_gigantic_folio will turn the folio into a + * simple group of pages. After this the destructor does not * apply. * * This handles the case where more than one ref is held when and - * after update_and_free_page is called. + * after update_and_free_hugetlb_folio is called. * * In the case of demote we do not ref count the page as it will soon * be turned into a page of smaller size. 
*/ if (!demote) - set_page_refcounted(page); + folio_ref_unfreeze(folio, 1); if (hstate_is_gigantic(h)) - set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR); else - set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); + folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); h->nr_huge_pages--; h->nr_huge_pages_node[nid]--; } -static void remove_hugetlb_page(struct hstate *h, struct page *page, +static void remove_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { - __remove_hugetlb_page(h, page, adjust_surplus, false); + __remove_hugetlb_folio(h, folio, adjust_surplus, false); } -static void remove_hugetlb_page_for_demote(struct hstate *h, struct page *page, +static void remove_hugetlb_folio_for_demote(struct hstate *h, struct folio *folio, bool adjust_surplus) { - __remove_hugetlb_page(h, page, adjust_surplus, true); + __remove_hugetlb_folio(h, folio, adjust_surplus, true); } -static void add_hugetlb_page(struct hstate *h, struct page *page, +static void add_hugetlb_folio(struct hstate *h, struct folio *folio, bool adjust_surplus) { int zeroed; - int nid = page_to_nid(page); + int nid = folio_nid(folio); - VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page); + VM_BUG_ON_FOLIO(!folio_test_hugetlb_vmemmap_optimized(folio), folio); lockdep_assert_held(&hugetlb_lock); - INIT_LIST_HEAD(&page->lru); + INIT_LIST_HEAD(&folio->lru); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; @@ -1519,21 +1675,21 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, h->surplus_huge_pages_node[nid]++; } - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - set_page_private(page, 0); + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + folio_change_private(folio, NULL); /* - * We have to set HPageVmemmapOptimized again as above - * set_page_private(page, 0) cleared it. + * We have to set hugetlb_vmemmap_optimized again as above + * folio_change_private(folio, NULL) cleared it. 
*/ - SetHPageVmemmapOptimized(page); + folio_set_hugetlb_vmemmap_optimized(folio); /* - * This page is about to be managed by the hugetlb allocator and + * This folio is about to be managed by the hugetlb allocator and * should have no users. Drop our reference, and check for others * just in case. */ - zeroed = put_page_testzero(page); - if (!zeroed) + zeroed = folio_put_testzero(folio); + if (unlikely(!zeroed)) /* * It is VERY unlikely soneone else has taken a ref on * the page. In this case, we simply return as the @@ -1542,13 +1698,14 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, */ return; - arch_clear_hugepage_flags(page); - enqueue_huge_page(h, page); + arch_clear_hugepage_flags(&folio->page); + enqueue_hugetlb_folio(h, folio); } static void __update_and_free_page(struct hstate *h, struct page *page) { int i; + struct folio *folio = page_folio(page); struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1558,7 +1715,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * If we don't know which subpages are hwpoisoned, we can't free * the hugepage, so it's leaked intentionally. */ - if (HPageRawHwpUnreliable(page)) + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; if (hugetlb_vmemmap_restore(h, page)) { @@ -1568,7 +1725,7 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * page and put the page back on the hugetlb free list and treat * as a surplus page. */ - add_hugetlb_page(h, page, true); + add_hugetlb_folio(h, folio, true); spin_unlock_irq(&hugetlb_lock); return; } @@ -1577,11 +1734,11 @@ static void __update_and_free_page(struct hstate *h, struct page *page) * Move PageHWPoison flag from head page to the raw error pages, * which makes any healthy subpages reusable. 
*/ - if (unlikely(PageHWPoison(page))) - hugetlb_clear_page_hwpoison(page); + if (unlikely(folio_test_hwpoison(folio))) + hugetlb_clear_page_hwpoison(&folio->page); for (i = 0; i < pages_per_huge_page(h); i++) { - subpage = nth_page(page, i); + subpage = folio_page(folio, i); subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | @@ -1590,19 +1747,19 @@ static void __update_and_free_page(struct hstate *h, struct page *page) /* * Non-gigantic pages demoted from CMA allocated gigantic pages - * need to be given back to CMA in free_gigantic_page. + * need to be given back to CMA in free_gigantic_folio. */ if (hstate_is_gigantic(h) || - hugetlb_cma_page(page, huge_page_order(h))) { - destroy_compound_gigantic_page(page, huge_page_order(h)); - free_gigantic_page(page, huge_page_order(h)); + hugetlb_cma_folio(folio, huge_page_order(h))) { + destroy_compound_gigantic_folio(folio, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); } else { __free_pages(page, huge_page_order(h)); } } /* - * As update_and_free_page() can be called under any context, so we cannot + * As update_and_free_hugetlb_folio() can be called under any context, so we cannot * use GFP_KERNEL to allocate vmemmap pages. However, we can defer the * actual freeing in a workqueue to prevent from using GFP_ATOMIC to allocate * the vmemmap pages. @@ -1631,8 +1788,9 @@ static void free_hpage_workfn(struct work_struct *work) /* * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() * is going to trigger because a previous call to - * remove_hugetlb_page() will set_compound_page_dtor(page, - * NULL_COMPOUND_DTOR), so do not use page_hstate() directly. + * remove_hugetlb_folio() will call folio_set_compound_dtor + * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate() + * directly. 
*/ h = size_to_hstate(page_size(page)); @@ -1649,11 +1807,11 @@ static inline void flush_free_hpage_work(struct hstate *h) flush_work(&free_hpage_work); } -static void update_and_free_page(struct hstate *h, struct page *page, +static void update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio, bool atomic) { - if (!HPageVmemmapOptimized(page) || !atomic) { - __update_and_free_page(h, page); + if (!folio_test_hugetlb_vmemmap_optimized(folio) || !atomic) { + __update_and_free_page(h, &folio->page); return; } @@ -1664,16 +1822,18 @@ static void update_and_free_page(struct hstate *h, struct page *page, * empty. Otherwise, schedule_work() had been called but the workfn * hasn't retrieved the list yet. */ - if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist)) + if (llist_add((struct llist_node *)&folio->mapping, &hpage_freelist)) schedule_work(&free_hpage_work); } static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list) { struct page *page, *t_page; + struct folio *folio; list_for_each_entry_safe(page, t_page, list, lru) { - update_and_free_page(h, page, false); + folio = page_folio(page); + update_and_free_hugetlb_folio(h, folio, false); cond_resched(); } } @@ -1695,21 +1855,22 @@ void free_huge_page(struct page *page) * Can't pass hstate in here because it is called from the * compound page destructor. 
*/ - struct hstate *h = page_hstate(page); - int nid = page_to_nid(page); - struct hugepage_subpool *spool = hugetlb_page_subpool(page); + struct folio *folio = page_folio(page); + struct hstate *h = folio_hstate(folio); + int nid = folio_nid(folio); + struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); bool restore_reserve; unsigned long flags; - VM_BUG_ON_PAGE(page_count(page), page); - VM_BUG_ON_PAGE(page_mapcount(page), page); + VM_BUG_ON_FOLIO(folio_ref_count(folio), folio); + VM_BUG_ON_FOLIO(folio_mapcount(folio), folio); - hugetlb_set_page_subpool(page, NULL); - if (PageAnon(page)) - __ClearPageAnonExclusive(page); - page->mapping = NULL; - restore_reserve = HPageRestoreReserve(page); - ClearHPageRestoreReserve(page); + hugetlb_set_folio_subpool(folio, NULL); + if (folio_test_anon(folio)) + __ClearPageAnonExclusive(&folio->page); + folio->mapping = NULL; + restore_reserve = folio_test_hugetlb_restore_reserve(folio); + folio_clear_hugetlb_restore_reserve(folio); /* * If HPageRestoreReserve was set on page, page allocation consumed a @@ -1731,26 +1892,26 @@ void free_huge_page(struct page *page) } spin_lock_irqsave(&hugetlb_lock, flags); - ClearHPageMigratable(page); - hugetlb_cgroup_uncharge_page(hstate_index(h), - pages_per_huge_page(h), page); - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + folio_clear_hugetlb_migratable(folio); + hugetlb_cgroup_uncharge_folio(hstate_index(h), + pages_per_huge_page(h), folio); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); if (restore_reserve) h->resv_huge_pages++; - if (HPageTemporary(page)) { - remove_hugetlb_page(h, page, false); + if (folio_test_hugetlb_temporary(folio)) { + remove_hugetlb_folio(h, folio, false); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page, true); + update_and_free_hugetlb_folio(h, folio, true); } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ - 
remove_hugetlb_page(h, page, true); + remove_hugetlb_folio(h, folio, true); spin_unlock_irqrestore(&hugetlb_lock, flags); - update_and_free_page(h, page, true); + update_and_free_hugetlb_folio(h, folio, true); } else { arch_clear_hugepage_flags(page); - enqueue_huge_page(h, page); + enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } } @@ -1765,36 +1926,37 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid) h->nr_huge_pages_node[nid]++; } -static void __prep_new_huge_page(struct hstate *h, struct page *page) +static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { - hugetlb_vmemmap_optimize(h, page); - INIT_LIST_HEAD(&page->lru); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - hugetlb_set_page_subpool(page, NULL); - set_hugetlb_cgroup(page, NULL); - set_hugetlb_cgroup_rsvd(page, NULL); + hugetlb_vmemmap_optimize(h, &folio->page); + INIT_LIST_HEAD(&folio->lru); + folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + hugetlb_set_folio_subpool(folio, NULL); + set_hugetlb_cgroup(folio, NULL); + set_hugetlb_cgroup_rsvd(folio, NULL); } -static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) +static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int nid) { - __prep_new_huge_page(h, page); + __prep_new_hugetlb_folio(h, folio); spin_lock_irq(&hugetlb_lock); __prep_account_new_huge_page(h, nid); spin_unlock_irq(&hugetlb_lock); } -static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, - bool demote) +static bool __prep_compound_gigantic_folio(struct folio *folio, + unsigned int order, bool demote) { int i, j; int nr_pages = 1 << order; struct page *p; - /* we rely on prep_new_huge_page to set the destructor */ - set_compound_order(page, order); - __SetPageHead(page); + __folio_clear_reserved(folio); + __folio_set_head(folio); + /* we rely on prep_new_hugetlb_folio to set the destructor */ + folio_set_compound_order(folio, order); for (i 
= 0; i < nr_pages; i++) { - p = nth_page(page, i); + p = folio_page(folio, i); /* * For gigantic hugepages allocated through bootmem at @@ -1808,7 +1970,8 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, * on the head page when they need know if put_page() is needed * after get_user_pages(). */ - __ClearPageReserved(p); + if (i != 0) /* head page cleared above */ + __ClearPageReserved(p); /* * Subtle and very unlikely * @@ -1835,42 +1998,41 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, VM_BUG_ON_PAGE(page_count(p), p); } if (i != 0) - set_compound_head(p, page); + set_compound_head(p, &folio->page); } - atomic_set(compound_mapcount_ptr(page), -1); - atomic_set(compound_pincount_ptr(page), 0); + atomic_set(folio_mapcount_ptr(folio), -1); + atomic_set(folio_subpages_mapcount_ptr(folio), 0); + atomic_set(folio_pincount_ptr(folio), 0); return true; out_error: /* undo page modifications made above */ for (j = 0; j < i; j++) { - p = nth_page(page, j); + p = folio_page(folio, j); if (j != 0) clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ for (; j < nr_pages; j++) { - p = nth_page(page, j); + p = folio_page(folio, j); __ClearPageReserved(p); } - set_compound_order(page, 0); -#ifdef CONFIG_64BIT - page[1].compound_nr = 0; -#endif - __ClearPageHead(page); + folio_set_compound_order(folio, 0); + __folio_clear_head(folio); return false; } -static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +static bool prep_compound_gigantic_folio(struct folio *folio, + unsigned int order) { - return __prep_compound_gigantic_page(page, order, false); + return __prep_compound_gigantic_folio(folio, order, false); } -static bool prep_compound_gigantic_page_for_demote(struct page *page, +static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, unsigned int order) { - return __prep_compound_gigantic_page(page, order, true); + 
return __prep_compound_gigantic_folio(folio, order, true); } /* @@ -1935,7 +2097,7 @@ pgoff_t hugetlb_basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *alloc_buddy_huge_page(struct hstate *h, +static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { @@ -1973,11 +2135,6 @@ retry: page = NULL; } - if (page) - __count_vm_event(HTLB_BUDDY_PGALLOC); - else - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - /* * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this * indicates an overall state change. Clear bit so that we resume @@ -1994,7 +2151,13 @@ retry: if (node_alloc_noretry && !page && alloc_try_hard) node_set(nid, *node_alloc_noretry); - return page; + if (!page) { + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + return NULL; + } + + __count_vm_event(HTLB_BUDDY_PGALLOC); + return page_folio(page); } /* @@ -2004,29 +2167,28 @@ retry: * Note that returned page is 'frozen': ref count of head page and all tail * pages is zero. */ -static struct page *alloc_fresh_huge_page(struct hstate *h, +static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) { - struct page *page; + struct folio *folio; bool retry = false; retry: if (hstate_is_gigantic(h)) - page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask); else - page = alloc_buddy_huge_page(h, gfp_mask, + folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry); - if (!page) + if (!folio) return NULL; - if (hstate_is_gigantic(h)) { - if (!prep_compound_gigantic_page(page, huge_page_order(h))) { + if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) { /* * Rare failure to convert pages to compound page. * Free pages and try again - ONCE! 
*/ - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); if (!retry) { retry = true; goto retry; @@ -2034,9 +2196,9 @@ retry: return NULL; } } - prep_new_huge_page(h, page, page_to_nid(page)); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - return page; + return folio; } /* @@ -2046,23 +2208,20 @@ retry: static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, nodemask_t *node_alloc_noretry) { - struct page *page; + struct folio *folio; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed, - node_alloc_noretry); - if (page) - break; + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, + nodes_allowed, node_alloc_noretry); + if (folio) { + free_huge_page(&folio->page); /* free it into the hugepage allocator */ + return 1; + } } - if (!page) - return 0; - - free_huge_page(page); /* free it into the hugepage allocator */ - - return 1; + return 0; } /* @@ -2078,6 +2237,7 @@ static struct page *remove_pool_huge_page(struct hstate *h, { int nr_nodes, node; struct page *page = NULL; + struct folio *folio; lockdep_assert_held(&hugetlb_lock); for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { @@ -2089,7 +2249,8 @@ static struct page *remove_pool_huge_page(struct hstate *h, !list_empty(&h->hugepage_freelists[node])) { page = list_entry(h->hugepage_freelists[node].next, struct page, lru); - remove_hugetlb_page(h, page, acct_surplus); + folio = page_folio(page); + remove_hugetlb_folio(h, folio, acct_surplus); break; } } @@ -2114,21 +2275,21 @@ static struct page *remove_pool_huge_page(struct hstate *h, int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; + struct folio *folio = page_folio(page); retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ - if (!PageHuge(page)) + if (!folio_test_hugetlb(folio)) return 0; 
spin_lock_irq(&hugetlb_lock); - if (!PageHuge(page)) { + if (!folio_test_hugetlb(folio)) { rc = 0; goto out; } - if (!page_count(page)) { - struct page *head = compound_head(page); - struct hstate *h = page_hstate(head); + if (!folio_ref_count(folio)) { + struct hstate *h = folio_hstate(folio); if (!available_huge_pages(h)) goto out; @@ -2136,7 +2297,7 @@ retry: * We should make sure that the page is already on the free list * when it is dissolved. */ - if (unlikely(!HPageFreed(head))) { + if (unlikely(!folio_test_hugetlb_freed(folio))) { spin_unlock_irq(&hugetlb_lock); cond_resched(); @@ -2151,24 +2312,24 @@ retry: goto retry; } - remove_hugetlb_page(h, head, false); + remove_hugetlb_folio(h, folio, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); /* - * Normally update_and_free_page will allocate required vmemmmap - * before freeing the page. update_and_free_page will fail to + * Normally update_and_free_hugtlb_folio will allocate required vmemmmap + * before freeing the page. update_and_free_hugtlb_folio will fail to * free the page if it can not allocate required vmemmap. We * need to adjust max_huge_pages if the page is not freed. * Attempt to allocate vmemmmap here so that we can take * appropriate action on failure. 
*/ - rc = hugetlb_vmemmap_restore(h, head); + rc = hugetlb_vmemmap_restore(h, &folio->page); if (!rc) { - update_and_free_page(h, head, false); + update_and_free_hugetlb_folio(h, folio, false); } else { spin_lock_irq(&hugetlb_lock); - add_hugetlb_page(h, head, false); + add_hugetlb_folio(h, folio, false); h->max_huge_pages++; spin_unlock_irq(&hugetlb_lock); } @@ -2219,7 +2380,7 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page = NULL; + struct folio *folio = NULL; if (hstate_is_gigantic(h)) return NULL; @@ -2229,8 +2390,8 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); - if (!page) + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (!folio) return NULL; spin_lock_irq(&hugetlb_lock); @@ -2242,43 +2403,42 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - SetHPageTemporary(page); + folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); - free_huge_page(page); + free_huge_page(&folio->page); return NULL; } h->surplus_huge_pages++; - h->surplus_huge_pages_node[page_to_nid(page)]++; + h->surplus_huge_pages_node[folio_nid(folio)]++; out_unlock: spin_unlock_irq(&hugetlb_lock); - return page; + return &folio->page; } static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page; + struct folio *folio; if (hstate_is_gigantic(h)) return NULL; - page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); - if (!page) + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL); + if (!folio) return NULL; /* fresh huge pages are frozen */ - set_page_refcounted(page); - 
+ folio_ref_unfreeze(folio, 1); /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - SetHPageTemporary(page); + folio_set_hugetlb_temporary(folio); - return page; + return &folio->page; } /* @@ -2420,7 +2580,7 @@ retry: if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ - enqueue_huge_page(h, page); + enqueue_hugetlb_folio(h, page_folio(page)); } free: spin_unlock_irq(&hugetlb_lock); @@ -2727,51 +2887,52 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, } /* - * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one + * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve + * the old one * @h: struct hstate old page belongs to - * @old_page: Old page to dissolve + * @old_folio: Old folio to dissolve * @list: List to isolate the page in case we need to * Returns 0 on success, otherwise negated error. */ -static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, - struct list_head *list) +static int alloc_and_dissolve_hugetlb_folio(struct hstate *h, + struct folio *old_folio, struct list_head *list) { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - int nid = page_to_nid(old_page); - struct page *new_page; + int nid = folio_nid(old_folio); + struct folio *new_folio; int ret = 0; /* - * Before dissolving the page, we need to allocate a new one for the - * pool to remain stable. Here, we allocate the page and 'prep' it + * Before dissolving the folio, we need to allocate a new one for the + * pool to remain stable. Here, we allocate the folio and 'prep' it * by doing everything but actually updating counters and adding to * the pool. This simplifies and let us do most of the processing * under the lock. 
*/ - new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); - if (!new_page) + new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, NULL, NULL); + if (!new_folio) return -ENOMEM; - __prep_new_huge_page(h, new_page); + __prep_new_hugetlb_folio(h, new_folio); retry: spin_lock_irq(&hugetlb_lock); - if (!PageHuge(old_page)) { + if (!folio_test_hugetlb(old_folio)) { /* - * Freed from under us. Drop new_page too. + * Freed from under us. Drop new_folio too. */ goto free_new; - } else if (page_count(old_page)) { + } else if (folio_ref_count(old_folio)) { /* - * Someone has grabbed the page, try to isolate it here. + * Someone has grabbed the folio, try to isolate it here. * Fail with -EBUSY if not possible. */ spin_unlock_irq(&hugetlb_lock); - ret = isolate_hugetlb(old_page, list); + ret = isolate_hugetlb(&old_folio->page, list); spin_lock_irq(&hugetlb_lock); goto free_new; - } else if (!HPageFreed(old_page)) { + } else if (!folio_test_hugetlb_freed(old_folio)) { /* - * Page's refcount is 0 but it has not been enqueued in the + * Folio's refcount is 0 but it has not been enqueued in the * freelist yet. Race window is small, so we can succeed here if * we retry. */ @@ -2780,35 +2941,35 @@ retry: goto retry; } else { /* - * Ok, old_page is still a genuine free hugepage. Remove it from + * Ok, old_folio is still a genuine free hugepage. Remove it from * the freelist and decrease the counters. These will be * incremented again when calling __prep_account_new_huge_page() - * and enqueue_huge_page() for new_page. The counters will remain - * stable since this happens under the lock. + * and enqueue_hugetlb_folio() for new_folio. The counters will + * remain stable since this happens under the lock. */ - remove_hugetlb_page(h, old_page, false); + remove_hugetlb_folio(h, old_folio, false); /* - * Ref count on new page is already zero as it was dropped + * Ref count on new_folio is already zero as it was dropped * earlier. 
It can be directly added to the pool free list. */ __prep_account_new_huge_page(h, nid); - enqueue_huge_page(h, new_page); + enqueue_hugetlb_folio(h, new_folio); /* - * Pages have been replaced, we can safely free the old one. + * Folio has been replaced, we can safely free the old one. */ spin_unlock_irq(&hugetlb_lock); - update_and_free_page(h, old_page, false); + update_and_free_hugetlb_folio(h, old_folio, false); } return ret; free_new: spin_unlock_irq(&hugetlb_lock); - /* Page has a zero ref count, but needs a ref to be freed */ - set_page_refcounted(new_page); - update_and_free_page(h, new_page, false); + /* Folio has a zero ref count, but needs a ref to be freed */ + folio_ref_unfreeze(new_folio, 1); + update_and_free_hugetlb_folio(h, new_folio, false); return ret; } @@ -2816,7 +2977,7 @@ free_new: int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { struct hstate *h; - struct page *head; + struct folio *folio = page_folio(page); int ret = -EBUSY; /* @@ -2825,9 +2986,8 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) * Return success when racing as if we dissolved the page ourselves. 
*/ spin_lock_irq(&hugetlb_lock); - if (PageHuge(page)) { - head = compound_head(page); - h = page_hstate(head); + if (folio_test_hugetlb(folio)) { + h = folio_hstate(folio); } else { spin_unlock_irq(&hugetlb_lock); return 0; @@ -2842,10 +3002,10 @@ int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) if (hstate_is_gigantic(h)) return -ENOMEM; - if (page_count(head) && !isolate_hugetlb(head, list)) + if (folio_ref_count(folio) && !isolate_hugetlb(&folio->page, list)) ret = 0; - else if (!page_count(head)) - ret = alloc_and_dissolve_huge_page(h, head, list); + else if (!folio_ref_count(folio)) + ret = alloc_and_dissolve_hugetlb_folio(h, folio, list); return ret; } @@ -2856,6 +3016,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; + struct folio *folio; long map_chg, map_commit; long gbl_chg; int ret, idx; @@ -2924,15 +3085,16 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; + spin_lock_irq(&hugetlb_lock); if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { SetHPageRestoreReserve(page); h->resv_huge_pages--; } - spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); /* Fall through */ } + folio = page_folio(page); hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page. 
@@ -2962,8 +3124,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, rsv_adjust = hugepage_subpool_put_pages(spool, 1); hugetlb_acct_memory(h, -rsv_adjust); if (deferred_reserve) - hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), - pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), + pages_per_huge_page(h), folio); } return page; @@ -3028,17 +3190,18 @@ static void __init gather_bootmem_prealloc(void) list_for_each_entry(m, &huge_boot_pages, list) { struct page *page = virt_to_page(m); + struct folio *folio = page_folio(page); struct hstate *h = m->hstate; VM_BUG_ON(!hstate_is_gigantic(h)); - WARN_ON(page_count(page) != 1); - if (prep_compound_gigantic_page(page, huge_page_order(h))) { - WARN_ON(PageReserved(page)); - prep_new_huge_page(h, page, page_to_nid(page)); + WARN_ON(folio_ref_count(folio) != 1); + if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { + WARN_ON(folio_test_reserved(folio)); + prep_new_hugetlb_folio(h, folio, folio_nid(folio)); free_huge_page(page); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ - free_gigantic_page(page, huge_page_order(h)); + free_gigantic_folio(folio, huge_page_order(h)); } /* @@ -3060,14 +3223,14 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) if (!alloc_bootmem_huge_page(h, nid)) break; } else { - struct page *page; + struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - page = alloc_fresh_huge_page(h, gfp_mask, nid, + folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, &node_states[N_MEMORY], NULL); - if (!page) + if (!folio) break; - free_huge_page(page); /* free it into the hugepage allocator */ + free_huge_page(&folio->page); /* free it into the hugepage allocator */ } cond_resched(); } @@ -3212,7 +3375,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, goto out; if (PageHighMem(page)) continue; - remove_hugetlb_page(h, page, false); 
+ remove_hugetlb_folio(h, page_folio(page), false); list_add(&page->lru, &page_list); } } @@ -3417,12 +3580,13 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) { int i, nid = page_to_nid(page); struct hstate *target_hstate; + struct folio *folio = page_folio(page); struct page *subpage; int rc = 0; target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order); - remove_hugetlb_page_for_demote(h, page, false); + remove_hugetlb_folio_for_demote(h, folio, false); spin_unlock_irq(&hugetlb_lock); rc = hugetlb_vmemmap_restore(h, page); @@ -3430,15 +3594,15 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) /* Allocation of vmemmmap failed, we can not demote page */ spin_lock_irq(&hugetlb_lock); set_page_refcounted(page); - add_hugetlb_page(h, page, false); + add_hugetlb_folio(h, page_folio(page), false); return rc; } /* - * Use destroy_compound_hugetlb_page_for_demote for all huge page + * Use destroy_compound_hugetlb_folio_for_demote for all huge page * sizes as it will not ref count pages. */ - destroy_compound_hugetlb_page_for_demote(page, huge_page_order(h)); + destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h)); /* * Taking target hstate mutex synchronizes with set_max_huge_pages. 
@@ -3452,13 +3616,14 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) for (i = 0; i < pages_per_huge_page(h); i += pages_per_huge_page(target_hstate)) { subpage = nth_page(page, i); + folio = page_folio(subpage); if (hstate_is_gigantic(target_hstate)) - prep_compound_gigantic_page_for_demote(subpage, + prep_compound_gigantic_folio_for_demote(folio, target_hstate->order); else prep_compound_page(subpage, target_hstate->order); set_page_private(subpage, 0); - prep_new_huge_page(target_hstate, subpage, nid); + prep_new_hugetlb_folio(target_hstate, folio, nid); free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); @@ -4601,6 +4766,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) struct resv_map *resv = vma_resv_map(vma); /* + * HPAGE_RESV_OWNER indicates a private mapping. * This new VMA should share its siblings reservation map if present. * The VMA will only ever have a valid reservation map pointer where * it is being copied for another still existing VMA. As that VMA @@ -4615,11 +4781,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) /* * vma_lock structure for sharable mappings is vma specific. - * Clear old pointer (if copied via vm_area_dup) and create new. + * Clear old pointer (if copied via vm_area_dup) and allocate + * new structure. Before clearing, make sure vma_lock is not + * for this vma. 
*/ if (vma->vm_flags & VM_MAYSHARE) { - vma->vm_private_data = NULL; - hugetlb_vma_lock_alloc(vma); + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (vma_lock) { + if (vma_lock->vma != vma) { + vma->vm_private_data = NULL; + hugetlb_vma_lock_alloc(vma); + } else + pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__); + } else + hugetlb_vma_lock_alloc(vma); } } @@ -4756,7 +4932,6 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr hugepage_add_new_anon_rmap(new_page, vma, addr); set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); - ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); } @@ -5045,7 +5220,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - struct mmu_notifier_range range; unsigned long last_addr_mask; bool force_flush = false; @@ -5060,13 +5234,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_change_page_size(tlb, sz); tlb_start_vma(tlb, vma); - /* - * If sharing possible, alert mmu notifiers of worst case. - */ - mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start, - end); - adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); - mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); address = start; for (; address < end; address += sz) { @@ -5096,7 +5263,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct * unmapped and its refcount is dropped, so just clear pte here. 
*/ if (unlikely(!pte_present(pte))) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to @@ -5108,7 +5274,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); else -#endif huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; @@ -5137,13 +5302,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); -#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); -#endif hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); @@ -5155,7 +5318,6 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct if (ref_page) break; } - mmu_notifier_invalidate_range_end(&range); tlb_end_vma(tlb, vma); /* @@ -5183,29 +5345,43 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); + /* mmu notification performed in caller */ __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); - /* - * Unlock and free the vma lock before releasing i_mmap_rwsem. When - * the vma_lock is freed, this makes the vma ineligible for pmd - * sharing. And, i_mmap_rwsem is required to set up pmd sharing. - * This is important as page tables for this unmapped range will - * be asynchrously deleted. If the page tables are shared, there - * will be issues when accessed by someone else. 
- */ - __hugetlb_vma_unlock_write_free(vma); - - i_mmap_unlock_write(vma->vm_file->f_mapping); + if (zap_flags & ZAP_FLAG_UNMAP) { /* final unmap */ + /* + * Unlock and free the vma lock before releasing i_mmap_rwsem. + * When the vma_lock is freed, this makes the vma ineligible + * for pmd sharing. And, i_mmap_rwsem is required to set up + * pmd sharing. This is important as page tables for this + * unmapped range will be asynchrously deleted. If the page + * tables are shared, there will be issues when accessed by + * someone else. + */ + __hugetlb_vma_unlock_write_free(vma); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } else { + i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); + } } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { + struct mmu_notifier_range range; struct mmu_gather tlb; + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, + start, end); + adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end); + mmu_notifier_invalidate_range_start(&range); tlb_gather_mmu(&tlb, vma->vm_mm); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags); + + mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -5284,9 +5460,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr = address & huge_page_mask(h); struct mmu_notifier_range range; - VM_BUG_ON(unshare && (flags & FOLL_WRITE)); - VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); - /* * hugetlb does not support FOLL_FORCE-style write faults that keep the * PTE mapped R/O such as maybe_mkwrite() would do. @@ -5296,8 +5469,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Let's take out MAP_SHARED mappings first. 
*/ if (vma->vm_flags & VM_MAYSHARE) { - if (unlikely(unshare)) - return 0; set_huge_ptep_writable(vma, haddr, ptep); return 0; } @@ -5419,8 +5590,6 @@ retry_avoidcopy: spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && pte_same(huge_ptep_get(ptep), pte))) { - ClearHPageRestoreReserve(new_page); - /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); @@ -5715,10 +5884,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (!pte_same(huge_ptep_get(ptep), old_pte)) goto backout; - if (anon_rmap) { - ClearHPageRestoreReserve(page); + if (anon_rmap) hugepage_add_new_anon_rmap(page, vma, haddr); - } else + else page_dup_file_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); @@ -6092,6 +6260,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ptl = huge_pte_lock(h, dst_mm, dst_pte); + ret = -EIO; + if (PageHWPoison(page)) + goto out_release_unlock; + /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache @@ -6101,12 +6273,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; - if (page_in_pagecache) { + if (page_in_pagecache) page_dup_file_rmap(page, true); - } else { - ClearHPageRestoreReserve(page); + else hugepage_add_new_anon_rmap(page, dst_vma, dst_addr); - } /* * For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY @@ -6171,7 +6341,8 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, } } -static inline bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, +static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, + unsigned int flags, pte_t *pte, bool *unshare) { pte_t pteval = huge_ptep_get(pte); @@ -6183,13 +6354,69 @@ static inline 
bool __follow_hugetlb_must_fault(unsigned int flags, pte_t *pte, return false; if (flags & FOLL_WRITE) return true; - if (gup_must_unshare(flags, pte_page(pteval))) { + if (gup_must_unshare(vma, flags, pte_page(pteval))) { *unshare = true; return true; } return false; } +struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + struct mm_struct *mm = vma->vm_mm; + unsigned long haddr = address & huge_page_mask(h); + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte, entry; + + /* + * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via + * follow_hugetlb_page(). + */ + if (WARN_ON_ONCE(flags & FOLL_PIN)) + return NULL; + +retry: + pte = huge_pte_offset(mm, haddr, huge_page_size(h)); + if (!pte) + return NULL; + + ptl = huge_pte_lock(h, mm, pte); + entry = huge_ptep_get(pte); + if (pte_present(entry)) { + page = pte_page(entry) + + ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + /* + * Note that page may be a sub-page, and with vmemmap + * optimizations the page struct may be read only. + * try_grab_page() will increase the ref count on the + * head page, so this will be OK. + * + * try_grab_page() should always be able to get the page here, + * because we hold the ptl lock and have verified pte_present(). + */ + if (try_grab_page(page, flags)) { + page = NULL; + goto out; + } + } else { + if (is_hugetlb_entry_migration(entry)) { + spin_unlock(ptl); + __migration_entry_wait_huge(pte, ptl); + goto retry; + } + /* + * hwpoisoned entry is treated as no_page_table in + * follow_page_mask(). 
+ */ + } +out: + spin_unlock(ptl); + return page; +} + long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, unsigned long *nr_pages, @@ -6256,7 +6483,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * directly from any kind of swap entries. */ if (absent || - __follow_hugetlb_must_fault(flags, pte, &unshare)) { + __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) { vm_fault_t ret; unsigned int fault_flags = 0; @@ -6266,9 +6493,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, fault_flags |= FAULT_FLAG_WRITE; else if (unshare) fault_flags |= FAULT_FLAG_UNSHARE; - if (locked) + if (locked) { fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + if (flags & FOLL_INTERRUPTIBLE) + fault_flags |= FAULT_FLAG_INTERRUPTIBLE; + } if (flags & FOLL_NOWAIT) fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; @@ -6342,8 +6572,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * tables. If the huge page is present, then the tail * pages must also be present. The ptl prevents the * head page and tail pages from being rearranged in - * any way. So this page must be available at this - * point, unless the page refcount overflowed: + * any way. As this is hugetlb, the pages will never + * be p2pdma or not longterm pinable. 
So this page + * must be available at this point, unless the page + * refcount overflowed: */ if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, flags))) { @@ -6527,7 +6759,8 @@ bool hugetlb_reserve_pages(struct inode *inode, } /* - * vma specific semaphore used for pmd sharing synchronization + * vma specific semaphore used for pmd sharing and fault/truncation + * synchronization */ hugetlb_vma_lock_alloc(vma); @@ -6783,149 +7016,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, *end = ALIGN(*end, PUD_SIZE); } -static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma) -{ - return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && - vma->vm_private_data; -} - -void hugetlb_vma_lock_read(struct vm_area_struct *vma) -{ - if (__vma_shareable_flags_pmd(vma)) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - - down_read(&vma_lock->rw_sema); - } -} - -void hugetlb_vma_unlock_read(struct vm_area_struct *vma) -{ - if (__vma_shareable_flags_pmd(vma)) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - - up_read(&vma_lock->rw_sema); - } -} - -void hugetlb_vma_lock_write(struct vm_area_struct *vma) -{ - if (__vma_shareable_flags_pmd(vma)) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - - down_write(&vma_lock->rw_sema); - } -} - -void hugetlb_vma_unlock_write(struct vm_area_struct *vma) -{ - if (__vma_shareable_flags_pmd(vma)) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - - up_write(&vma_lock->rw_sema); - } -} - -int hugetlb_vma_trylock_write(struct vm_area_struct *vma) -{ - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - - if (!__vma_shareable_flags_pmd(vma)) - return 1; - - return down_write_trylock(&vma_lock->rw_sema); -} - -void hugetlb_vma_assert_locked(struct vm_area_struct *vma) -{ - if (__vma_shareable_flags_pmd(vma)) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - - lockdep_assert_held(&vma_lock->rw_sema); - } -} - -void hugetlb_vma_lock_release(struct kref 
*kref) -{ - struct hugetlb_vma_lock *vma_lock = container_of(kref, - struct hugetlb_vma_lock, refs); - - kfree(vma_lock); -} - -static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) -{ - struct vm_area_struct *vma = vma_lock->vma; - - /* - * vma_lock structure may or not be released as a result of put, - * it certainly will no longer be attached to vma so clear pointer. - * Semaphore synchronizes access to vma_lock->vma field. - */ - vma_lock->vma = NULL; - vma->vm_private_data = NULL; - up_write(&vma_lock->rw_sema); - kref_put(&vma_lock->refs, hugetlb_vma_lock_release); -} - -static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) -{ - if (__vma_shareable_flags_pmd(vma)) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - - __hugetlb_vma_unlock_write_put(vma_lock); - } -} - -static void hugetlb_vma_lock_free(struct vm_area_struct *vma) -{ - /* - * Only present in sharable vmas. - */ - if (!vma || !__vma_shareable_flags_pmd(vma)) - return; - - if (vma->vm_private_data) { - struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - - down_write(&vma_lock->rw_sema); - __hugetlb_vma_unlock_write_put(vma_lock); - } -} - -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) -{ - struct hugetlb_vma_lock *vma_lock; - - /* Only establish in (flags) sharable vmas */ - if (!vma || !(vma->vm_flags & VM_MAYSHARE)) - return; - - /* Should never get here with non-NULL vm_private_data */ - if (vma->vm_private_data) - return; - - vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); - if (!vma_lock) { - /* - * If we can not allocate structure, then vma can not - * participate in pmd sharing. This is only a possible - * performance enhancement and memory saving issue. - * However, the lock is also used to synchronize page - * faults with truncation. If the lock is not present, - * unlikely races could leave pages in a file past i_size - * until the file is removed. Warn in the unlikely case of - * allocation failure. 
- */ - pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); - return; - } - - kref_init(&vma_lock->refs); - init_rwsem(&vma_lock->rw_sema); - vma_lock->vma = vma; - vma->vm_private_data = vma_lock; -} - /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the @@ -7014,47 +7104,6 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ -void hugetlb_vma_lock_read(struct vm_area_struct *vma) -{ -} - -void hugetlb_vma_unlock_read(struct vm_area_struct *vma) -{ -} - -void hugetlb_vma_lock_write(struct vm_area_struct *vma) -{ -} - -void hugetlb_vma_unlock_write(struct vm_area_struct *vma) -{ -} - -int hugetlb_vma_trylock_write(struct vm_area_struct *vma) -{ - return 1; -} - -void hugetlb_vma_assert_locked(struct vm_area_struct *vma) -{ -} - -void hugetlb_vma_lock_release(struct kref *kref) -{ -} - -static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) -{ -} - -static void hugetlb_vma_lock_free(struct vm_area_struct *vma) -{ -} - -static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) -{ -} - pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { @@ -7182,122 +7231,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h) * These functions are overwritable if your architecture needs its own * behavior. 
*/ -struct page * __weak -follow_huge_addr(struct mm_struct *mm, unsigned long address, - int write) -{ - return ERR_PTR(-EINVAL); -} - -struct page * __weak -follow_huge_pd(struct vm_area_struct *vma, - unsigned long address, hugepd_t hpd, int flags, int pdshift) -{ - WARN(1, "hugepd follow called with no support for hugepage directory format\n"); - return NULL; -} - -struct page * __weak -follow_huge_pmd_pte(struct vm_area_struct *vma, unsigned long address, int flags) -{ - struct hstate *h = hstate_vma(vma); - struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; - spinlock_t *ptl; - pte_t *ptep, pte; - - /* - * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via - * follow_hugetlb_page(). - */ - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptep = huge_pte_offset(mm, address, huge_page_size(h)); - if (!ptep) - return NULL; - - ptl = huge_pte_lock(h, mm, ptep); - pte = huge_ptep_get(ptep); - if (pte_present(pte)) { - page = pte_page(pte) + - ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); - /* - * try_grab_page() should always succeed here, because: a) we - * hold the pmd (ptl) lock, and b) we've just checked that the - * huge pmd (head) page is present in the page tables. The ptl - * prevents the head page and tail pages from being rearranged - * in any way. So this page must be available at this point, - * unless the page refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait_huge(ptep, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). 
- */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pud(struct mm_struct *mm, unsigned long address, - pud_t *pud, int flags) -{ - struct page *page = NULL; - spinlock_t *ptl; - pte_t pte; - - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; - -retry: - ptl = huge_pte_lock(hstate_sizelog(PUD_SHIFT), mm, (pte_t *)pud); - if (!pud_huge(*pud)) - goto out; - pte = huge_ptep_get((pte_t *)pud); - if (pte_present(pte)) { - page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(!try_grab_page(page, flags))) { - page = NULL; - goto out; - } - } else { - if (is_hugetlb_entry_migration(pte)) { - spin_unlock(ptl); - __migration_entry_wait(mm, (pte_t *)pud, ptl); - goto retry; - } - /* - * hwpoisoned entry is treated as no_page_table in - * follow_page_mask(). - */ - } -out: - spin_unlock(ptl); - return page; -} - -struct page * __weak -follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags) -{ - if (flags & (FOLL_GET | FOLL_PIN)) - return NULL; - - return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT); -} - int isolate_hugetlb(struct page *page, struct list_head *list) { int ret = 0; @@ -7316,7 +7249,7 @@ unlock: return ret; } -int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +int get_hwpoison_huge_page(struct page *page, bool *hugetlb, bool unpoison) { int ret = 0; @@ -7326,7 +7259,7 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) *hugetlb = true; if (HPageFreed(page)) ret = 0; - else if (HPageMigratable(page)) + else if (HPageMigratable(page) || unpoison) ret = get_page_unless_zero(page); else ret = -EBUSY; @@ -7335,12 +7268,13 @@ int get_hwpoison_huge_page(struct page *page, bool *hugetlb) return ret; } -int get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { int ret; spin_lock_irq(&hugetlb_lock); - ret = __get_huge_page_for_hwpoison(pfn, 
flags); + ret = __get_huge_page_for_hwpoison(pfn, flags, migratable_cleared); spin_unlock_irq(&hugetlb_lock); return ret; } @@ -7354,15 +7288,15 @@ void putback_active_hugepage(struct page *page) put_page(page); } -void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason) { - struct hstate *h = page_hstate(oldpage); + struct hstate *h = folio_hstate(old_folio); - hugetlb_cgroup_migrate(oldpage, newpage); - set_page_owner_migrate_reason(newpage, reason); + hugetlb_cgroup_migrate(old_folio, new_folio); + set_page_owner_migrate_reason(&new_folio->page, reason); /* - * transfer temporary state of the new huge page. This is + * transfer temporary state of the new hugetlb folio. This is * reverse to other transitions because the newpage is going to * be final while the old one will be freed so it takes over * the temporary status. @@ -7371,12 +7305,13 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) * here as well otherwise the global surplus count will not match * the per-node's. 
*/ - if (HPageTemporary(newpage)) { - int old_nid = page_to_nid(oldpage); - int new_nid = page_to_nid(newpage); + if (folio_test_hugetlb_temporary(new_folio)) { + int old_nid = folio_nid(old_folio); + int new_nid = folio_nid(new_folio); + + folio_set_hugetlb_temporary(old_folio); + folio_clear_hugetlb_temporary(new_folio); - SetHPageTemporary(oldpage); - ClearHPageTemporary(newpage); /* * There is no need to transfer the per-node surplus state diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index f61d132df52b..d9e4425d81ac 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -191,8 +191,9 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page_counter *counter; struct hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); + struct folio *folio = page_folio(page); - page_hcg = hugetlb_cgroup_from_page(page); + page_hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. 
We can safely @@ -211,7 +212,7 @@ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); - set_hugetlb_cgroup(page, parent); + set_hugetlb_cgroup(folio, parent); out: return; } @@ -309,21 +310,21 @@ int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, - struct page *page, bool rsvd) + struct folio *folio, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; - __set_hugetlb_cgroup(page, h_cg, rsvd); + __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = - h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. 
*/ - WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage + nr_pages); } } @@ -332,31 +333,35 @@ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, false); + struct folio *folio = page_folio(page); + + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) { - __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, page, true); + struct folio *folio = page_folio(page); + + __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } /* * Should be called with hugetlb_lock held */ -static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page, bool rsvd) +static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); - h_cg = __hugetlb_cgroup_from_page(page, rsvd); + h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; - __set_hugetlb_cgroup(page, NULL, rsvd); + __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), @@ -366,27 +371,27 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, css_put(&h_cg->css); else { unsigned long usage = - h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. 
*/ - WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } -void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, - struct page *page) +void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, + struct folio *folio) { - __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, false); + __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } -void hugetlb_cgroup_uncharge_page_rsvd(int idx, unsigned long nr_pages, - struct page *page) +void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, + struct folio *folio) { - __hugetlb_cgroup_uncharge_page(idx, nr_pages, page, true); + __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, @@ -883,25 +888,25 @@ void __init hugetlb_cgroup_file_init(void) * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ -void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) +void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; - struct hstate *h = page_hstate(oldhpage); + struct hstate *h = folio_hstate(old_folio); if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); - h_cg = hugetlb_cgroup_from_page(oldhpage); - h_cg_rsvd = hugetlb_cgroup_from_page_rsvd(oldhpage); - set_hugetlb_cgroup(oldhpage, NULL); - set_hugetlb_cgroup_rsvd(oldhpage, NULL); + h_cg = hugetlb_cgroup_from_folio(old_folio); + h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); + set_hugetlb_cgroup(old_folio, NULL); + set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ - set_hugetlb_cgroup(newhpage, h_cg); - set_hugetlb_cgroup_rsvd(newhpage, h_cg_rsvd); - list_move(&newhpage->lru, &h->hugepage_activelist); + set_hugetlb_cgroup(new_folio, h_cg); + 
set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); + list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); return; } diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index ba2a2596fb4e..45e93a545dd7 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> +#include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> @@ -202,12 +203,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end); - /* - * We only change the mapping of the vmemmap virtual address range - * [@start + PAGE_SIZE, end), so we only need to flush the TLB which - * belongs to the range. - */ - flush_tlb_kernel_range(start + PAGE_SIZE, end); + flush_tlb_kernel_range(start, end); return 0; } @@ -231,10 +227,8 @@ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; - list_for_each_entry_safe(page, next, list, lru) { - list_del(&page->lru); + list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); - } } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, @@ -245,9 +239,23 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; - pte_t entry = mk_pte(walk->reuse_page, pgprot); struct page *page = pte_page(*pte); + pte_t entry; + /* Remapping the head page requires r/w */ + if (unlikely(addr == walk->reuse_addr)) { + pgprot = PAGE_KERNEL; + list_del(&walk->reuse_page->lru); + + /* + * Makes sure that preceding stores to the page contents from + * vmemmap_remap_free() become visible before the set_pte_at() + * write. 
+ */ + smp_wmb(); + } + + entry = mk_pte(walk->reuse_page, pgprot); list_add_tail(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } @@ -316,6 +324,24 @@ static int vmemmap_remap_free(unsigned long start, unsigned long end, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, }; + int nid = page_to_nid((struct page *)start); + gfp_t gfp_mask = GFP_KERNEL | __GFP_THISNODE | __GFP_NORETRY | + __GFP_NOWARN; + + /* + * Allocate a new head vmemmap page to avoid breaking a contiguous + * block of struct page memory when freeing it back to page allocator + * in free_vmemmap_page_list(). This will allow the likely contiguous + * struct page backing memory to be kept contiguous and allowing for + * more allocations of hugepages. Fallback to the currently + * mapped head page in case should it fail to allocate. + */ + walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); + if (walk.reuse_page) { + copy_page(page_to_virt(walk.reuse_page), + (void *)walk.reuse_addr); + list_add(&walk.reuse_page->lru, &vmemmap_pages); + } /* * In order to make remapping routine most efficient for the huge pages, diff --git a/mm/internal.h b/mm/internal.h index 6b7ef495b56d..bcf75a8b032d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -106,9 +106,9 @@ static inline void force_page_cache_readahead(struct address_space *mapping, force_page_cache_ra(&ractl, nr_to_read); } -unsigned find_lock_entries(struct address_space *mapping, pgoff_t start, +unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); -unsigned find_get_entries(struct address_space *mapping, pgoff_t start, +unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices); void filemap_free_folio(struct address_space *mapping, struct folio *folio); int truncate_inode_folio(struct address_space *mapping, struct folio *folio); @@ -708,14 +708,6 @@ extern u64 
hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; -#ifdef CONFIG_MEMORY_FAILURE -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages); -#else -static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ -} -#endif - extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d8b5590f9484..b076f597a378 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -450,15 +450,22 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) __memset(alloc_meta, 0, sizeof(*alloc_meta)); } -size_t kasan_metadata_size(struct kmem_cache *cache) +size_t kasan_metadata_size(struct kmem_cache *cache, bool in_object) { + struct kasan_cache *info = &cache->kasan_info; + if (!kasan_requires_meta()) return 0; - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - ((cache->kasan_info.free_meta_offset && - cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? - sizeof(struct kasan_free_meta) : 0); + + if (in_object) + return (info->free_meta_offset ? + 0 : sizeof(struct kasan_free_meta)); + else + return (info->alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + ((info->free_meta_offset && + info->free_meta_offset != KASAN_NO_FREE_META) ? + sizeof(struct kasan_free_meta) : 0); } static void __kasan_record_aux_stack(void *addr, bool can_alloc) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index abbcc1b0eec5..ea8cf1310b1e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -261,14 +261,6 @@ struct kasan_stack_ring { #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -/* Used in KUnit-compatible KASAN tests. 
*/ -struct kunit_kasan_status { - bool report_found; - bool sync_fault; -}; -#endif - #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) static inline const void *kasan_shadow_to_mem(const void *shadow_addr) @@ -549,6 +541,18 @@ static inline bool kasan_arch_is_ready(void) { return true; } #error kasan_arch_is_ready only works in KASAN generic outline mode! #endif +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) + +void kasan_kunit_test_suite_start(void); +void kasan_kunit_test_suite_end(void); + +#else /* CONFIG_KASAN_KUNIT_TEST */ + +static inline void kasan_kunit_test_suite_start(void) { } +static inline void kasan_kunit_test_suite_end(void) { } + +#endif /* CONFIG_KASAN_KUNIT_TEST */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) bool kasan_save_enable_multi_shot(void); diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index 0d59098f0876..74cd80c12b25 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -5,8 +5,12 @@ * Author: Andrey Ryabinin <a.ryabinin@samsung.com> */ +#define pr_fmt(fmt) "kasan_test: " fmt + +#include <kunit/test.h> #include <linux/bitops.h> #include <linux/delay.h> +#include <linux/io.h> #include <linux/kasan.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -14,21 +18,28 @@ #include <linux/module.h> #include <linux/printk.h> #include <linux/random.h> +#include <linux/set_memory.h> #include <linux/slab.h> #include <linux/string.h> +#include <linux/tracepoint.h> #include <linux/uaccess.h> -#include <linux/io.h> #include <linux/vmalloc.h> -#include <linux/set_memory.h> +#include <trace/events/printk.h> #include <asm/page.h> -#include <kunit/test.h> - #include "kasan.h" #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) +static bool multishot; + +/* Fields set based on lines observed in the console. 
*/ +static struct { + bool report_found; + bool async_fault; +} test_status; + /* * Some tests use these global variables to store return values from function * calls that could otherwise be eliminated by the compiler as dead code. @@ -36,35 +47,65 @@ void *kasan_ptr_result; int kasan_int_result; -static struct kunit_resource resource; -static struct kunit_kasan_status test_status; -static bool multishot; +/* Probe for console output: obtains test_status lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + if (strnstr(buf, "BUG: KASAN: ", len)) + WRITE_ONCE(test_status.report_found, true); + else if (strnstr(buf, "Asynchronous fault: ", len)) + WRITE_ONCE(test_status.async_fault, true); +} -/* - * Temporarily enable multi-shot mode. Otherwise, KASAN would only report the - * first detected bug and panic the kernel if panic_on_warn is enabled. For - * hardware tag-based KASAN also allow tag checking to be reenabled for each - * test, see the comment for KUNIT_EXPECT_KASAN_FAIL(). - */ -static int kasan_test_init(struct kunit *test) +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kasan_suite_init(struct kunit_suite *suite) { if (!kasan_enabled()) { - kunit_err(test, "can't run KASAN tests with KASAN disabled"); + pr_err("Can't run KASAN tests with KASAN disabled"); return -1; } + /* Stop failing KUnit tests on KASAN reports. */ + kasan_kunit_test_suite_start(); + + /* + * Temporarily enable multi-shot mode. Otherwise, KASAN would only + * report the first detected bug and panic the kernel if panic_on_warn + * is enabled. 
+ */ multishot = kasan_save_enable_multi_shot(); - test_status.report_found = false; - test_status.sync_fault = false; - kunit_add_named_resource(test, NULL, NULL, &resource, - "kasan_status", &test_status); + + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); return 0; } -static void kasan_test_exit(struct kunit *test) +static void kasan_suite_exit(struct kunit_suite *suite) { + kasan_kunit_test_suite_end(); kasan_restore_multi_shot(multishot); - KUNIT_EXPECT_FALSE(test, test_status.report_found); + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +static void kasan_test_exit(struct kunit *test) +{ + KUNIT_EXPECT_FALSE(test, READ_ONCE(test_status.report_found)); } /** @@ -106,11 +147,12 @@ static void kasan_test_exit(struct kunit *test) if (IS_ENABLED(CONFIG_KASAN_HW_TAGS) && \ kasan_sync_fault_possible()) { \ if (READ_ONCE(test_status.report_found) && \ - READ_ONCE(test_status.sync_fault)) \ + !READ_ONCE(test_status.async_fault)) \ kasan_enable_tagging(); \ migrate_enable(); \ } \ WRITE_ONCE(test_status.report_found, false); \ + WRITE_ONCE(test_status.async_fault, false); \ } while (0) #define KASAN_TEST_NEEDS_CONFIG_ON(test, config) do { \ @@ -783,23 +825,30 @@ static void kasan_global_oob_left(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, *(volatile char *)p); } -/* Check that ksize() makes the whole object accessible. */ +/* Check that ksize() does NOT unpoison whole object. 
*/ static void ksize_unpoisons_memory(struct kunit *test) { char *ptr; - size_t size = 123, real_size; + size_t size = 128 - KASAN_GRANULE_SIZE - 5; + size_t real_size; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + real_size = ksize(ptr); + KUNIT_EXPECT_GT(test, real_size, size); OPTIMIZER_HIDE_VAR(ptr); - /* This access shouldn't trigger a KASAN report. */ - ptr[size] = 'x'; + /* These accesses shouldn't trigger a KASAN report. */ + ptr[0] = 'x'; + ptr[size - 1] = 'x'; - /* This one must. */ - KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size]); + /* These must trigger a KASAN report. */ + if (IS_ENABLED(CONFIG_KASAN_GENERIC)) + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[size + 5]); + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr)[real_size - 1]); kfree(ptr); } @@ -1103,6 +1152,67 @@ static void kmalloc_double_kzfree(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kfree_sensitive(ptr)); } +/* + * The two tests below check that Generic KASAN prints auxiliary stack traces + * for RCU callbacks and workqueues. The reports need to be inspected manually. + * + * These tests are still enabled for other KASAN modes to make sure that all + * modes report bad accesses in tested scenarios. 
+ */ + +static struct kasan_rcu_info { + int i; + struct rcu_head rcu; +} *global_rcu_ptr; + +static void rcu_uaf_reclaim(struct rcu_head *rp) +{ + struct kasan_rcu_info *fp = + container_of(rp, struct kasan_rcu_info, rcu); + + kfree(fp); + ((volatile struct kasan_rcu_info *)fp)->i; +} + +static void rcu_uaf(struct kunit *test) +{ + struct kasan_rcu_info *ptr; + + ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + global_rcu_ptr = rcu_dereference_protected( + (struct kasan_rcu_info __rcu *)ptr, NULL); + + KUNIT_EXPECT_KASAN_FAIL(test, + call_rcu(&global_rcu_ptr->rcu, rcu_uaf_reclaim); + rcu_barrier()); +} + +static void workqueue_uaf_work(struct work_struct *work) +{ + kfree(work); +} + +static void workqueue_uaf(struct kunit *test) +{ + struct workqueue_struct *workqueue; + struct work_struct *work; + + workqueue = create_workqueue("kasan_workqueue_test"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, workqueue); + + work = kmalloc(sizeof(struct work_struct), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, work); + + INIT_WORK(work, workqueue_uaf_work); + queue_work(workqueue, work); + destroy_workqueue(workqueue); + + KUNIT_EXPECT_KASAN_FAIL(test, + ((volatile struct work_struct *)work)->data); +} + static void vmalloc_helpers_tags(struct kunit *test) { void *ptr; @@ -1299,7 +1409,7 @@ static void match_all_not_assigned(struct kunit *test) KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); for (i = 0; i < 256; i++) { - size = prandom_u32_max(1024) + 1; + size = get_random_u32_inclusive(1, 1024); ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); @@ -1308,7 +1418,7 @@ static void match_all_not_assigned(struct kunit *test) } for (i = 0; i < 256; i++) { - order = prandom_u32_max(4) + 1; + order = get_random_u32_inclusive(1, 4); pages = alloc_pages(GFP_KERNEL, order); ptr = page_address(pages); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, 
ptr); @@ -1321,7 +1431,7 @@ static void match_all_not_assigned(struct kunit *test) return; for (i = 0; i < 256; i++) { - size = prandom_u32_max(1024) + 1; + size = get_random_u32_inclusive(1, 1024); ptr = vmalloc(size); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); KUNIT_EXPECT_GE(test, (u8)get_tag(ptr), (u8)KASAN_TAG_MIN); @@ -1434,6 +1544,8 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), KUNIT_CASE(kmalloc_double_kzfree), + KUNIT_CASE(rcu_uaf), + KUNIT_CASE(workqueue_uaf), KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), @@ -1447,9 +1559,10 @@ static struct kunit_case kasan_kunit_test_cases[] = { static struct kunit_suite kasan_kunit_test_suite = { .name = "kasan", - .init = kasan_test_init, .test_cases = kasan_kunit_test_cases, .exit = kasan_test_exit, + .suite_init = kasan_suite_init, + .suite_exit = kasan_suite_exit, }; kunit_test_suite(kasan_kunit_test_suite); diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c index e4ca82dc2c16..7be7bed456ef 100644 --- a/mm/kasan/kasan_test_module.c +++ b/mm/kasan/kasan_test_module.c @@ -62,64 +62,6 @@ static noinline void __init copy_user_test(void) kfree(kmem); } -static struct kasan_rcu_info { - int i; - struct rcu_head rcu; -} *global_rcu_ptr; - -static noinline void __init kasan_rcu_reclaim(struct rcu_head *rp) -{ - struct kasan_rcu_info *fp = container_of(rp, - struct kasan_rcu_info, rcu); - - kfree(fp); - ((volatile struct kasan_rcu_info *)fp)->i; -} - -static noinline void __init kasan_rcu_uaf(void) -{ - struct kasan_rcu_info *ptr; - - pr_info("use-after-free in kasan_rcu_reclaim\n"); - ptr = kmalloc(sizeof(struct kasan_rcu_info), GFP_KERNEL); - if (!ptr) { - pr_err("Allocation failed\n"); - return; - } - - global_rcu_ptr = rcu_dereference_protected(ptr, NULL); - call_rcu(&global_rcu_ptr->rcu, kasan_rcu_reclaim); -} - -static noinline void __init kasan_workqueue_work(struct work_struct *work) -{ 
- kfree(work); -} - -static noinline void __init kasan_workqueue_uaf(void) -{ - struct workqueue_struct *workqueue; - struct work_struct *work; - - workqueue = create_workqueue("kasan_wq_test"); - if (!workqueue) { - pr_err("Allocation failed\n"); - return; - } - work = kmalloc(sizeof(struct work_struct), GFP_KERNEL); - if (!work) { - pr_err("Allocation failed\n"); - return; - } - - INIT_WORK(work, kasan_workqueue_work); - queue_work(workqueue, work); - destroy_workqueue(workqueue); - - pr_info("use-after-free on workqueue\n"); - ((volatile struct work_struct *)work)->data; -} - static int __init test_kasan_module_init(void) { /* @@ -130,8 +72,6 @@ static int __init test_kasan_module_init(void) bool multishot = kasan_save_enable_multi_shot(); copy_user_test(); - kasan_rcu_uaf(); - kasan_workqueue_uaf(); kasan_restore_multi_shot(multishot); return -EAGAIN; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index df3602062bfd..1d02757e90a3 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -9,6 +9,7 @@ * Andrey Konovalov <andreyknvl@gmail.com> */ +#include <kunit/test.h> #include <linux/bitops.h> #include <linux/ftrace.h> #include <linux/init.h> @@ -30,8 +31,6 @@ #include <asm/sections.h> -#include <kunit/test.h> - #include "kasan.h" #include "../slab.h" @@ -115,40 +114,63 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); #endif #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -static void update_kunit_status(bool sync) + +/* + * Whether the KASAN KUnit test suite is currently being executed. + * Updated in kasan_test.c. 
+ */ +bool kasan_kunit_executing; + +void kasan_kunit_test_suite_start(void) +{ + WRITE_ONCE(kasan_kunit_executing, true); +} +EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start); + +void kasan_kunit_test_suite_end(void) +{ + WRITE_ONCE(kasan_kunit_executing, false); +} +EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end); + +static bool kasan_kunit_test_suite_executing(void) +{ + return READ_ONCE(kasan_kunit_executing); +} + +#else /* CONFIG_KASAN_KUNIT_TEST */ + +static inline bool kasan_kunit_test_suite_executing(void) { return false; } + +#endif /* CONFIG_KASAN_KUNIT_TEST */ + +#if IS_ENABLED(CONFIG_KUNIT) + +static void fail_non_kasan_kunit_test(void) { struct kunit *test; - struct kunit_resource *resource; - struct kunit_kasan_status *status; - test = current->kunit_test; - if (!test) + if (kasan_kunit_test_suite_executing()) return; - resource = kunit_find_named_resource(test, "kasan_status"); - if (!resource) { + test = current->kunit_test; + if (test) kunit_set_failure(test); - return; - } +} - status = (struct kunit_kasan_status *)resource->data; - WRITE_ONCE(status->report_found, true); - WRITE_ONCE(status->sync_fault, sync); +#else /* CONFIG_KUNIT */ - kunit_put_resource(resource); -} -#else -static void update_kunit_status(bool sync) { } -#endif +static inline void fail_non_kasan_kunit_test(void) { } + +#endif /* CONFIG_KUNIT */ static DEFINE_SPINLOCK(report_lock); static void start_report(unsigned long *flags, bool sync) { + fail_non_kasan_kunit_test(); /* Respect the /proc/sys/kernel/traceoff_on_warning interface. */ disable_trace_on_warning(); - /* Update status of the currently running KASAN test. */ - update_kunit_status(sync); /* Do not allow LOCKDEP mangling KASAN reports. */ lockdep_off(); /* Make sure we don't end up in loop. 
*/ @@ -164,8 +186,8 @@ static void end_report(unsigned long *flags, void *addr) (unsigned long)addr); pr_err("==================================================================\n"); spin_unlock_irqrestore(&report_lock, *flags); - if (panic_on_warn && !test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) - panic("panic_on_warn set ...\n"); + if (!test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags)) + check_panic_on_warn("KASAN"); if (kasan_arg_fault == KASAN_ARG_FAULT_PANIC) panic("kasan.fault=panic set ...\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index 0e3648b603a6..2fba1f51f042 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -244,7 +244,7 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - hotplug_memory_notifier(kasan_mem_notifier, 0); + hotplug_memory_notifier(kasan_mem_notifier, DEFAULT_CALLBACK_PRI); return 0; } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 141788858b70..5349c37a5dac 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -26,7 +26,6 @@ #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/sched/clock.h> -#include <linux/sched/sysctl.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/spinlock.h> @@ -360,9 +359,9 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g unsigned long flags; struct slab *slab; void *addr; - const bool random_right_allocate = prandom_u32_max(2); + const bool random_right_allocate = get_random_u32_below(2); const bool random_fault = CONFIG_KFENCE_STRESS_TEST_FAULTS && - !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS); + !get_random_u32_below(CONFIG_KFENCE_STRESS_TEST_FAULTS); /* Try to obtain a free object. */ raw_spin_lock_irqsave(&kfence_freelist_lock, flags); @@ -799,16 +798,7 @@ static void toggle_allocation_gate(struct work_struct *work) /* Enable static key, and await allocation to happen. 
*/ static_branch_enable(&kfence_allocation_key); - if (sysctl_hung_task_timeout_secs) { - /* - * During low activity with no allocations we might wait a - * while; let's avoid the hung task warning. - */ - wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), - sysctl_hung_task_timeout_secs * HZ / 2); - } else { - wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); - } + wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); /* Disable static key and reset timer. */ static_branch_disable(&kfence_allocation_key); diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index a97bffe0cc3e..b5d66a69200d 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -532,8 +532,8 @@ static void test_free_bulk(struct kunit *test) int iter; for (iter = 0; iter < 5; iter++) { - const size_t size = setup_test_cache(test, 8 + prandom_u32_max(300), 0, - (iter & 1) ? ctor_set_x : NULL); + const size_t size = setup_test_cache(test, get_random_u32_inclusive(8, 307), + 0, (iter & 1) ? ctor_set_x : NULL); void *objects[] = { test_alloc(test, size, GFP_KERNEL, ALLOCATE_RIGHT), test_alloc(test, size, GFP_KERNEL, ALLOCATE_NONE), diff --git a/mm/kfence/report.c b/mm/kfence/report.c index 7e496856c2eb..60205f1257ef 100644 --- a/mm/kfence/report.c +++ b/mm/kfence/report.c @@ -75,18 +75,23 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfence_") || str_has_prefix(buf, ARCH_FUNC_PREFIX "__kfence_") || + str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") || !strncmp(buf, ARCH_FUNC_PREFIX "__slab_free", len)) { /* - * In case of tail calls from any of the below - * to any of the above. + * In case of tail calls from any of the below to any of + * the above, optimized by the compiler such that the + * stack trace would omit the initial entry point below. 
*/ fallback = skipnr + 1; } - /* Also the *_bulk() variants by only checking prefixes. */ + /* + * The below list should only include the initial entry points + * into the slab allocators. Includes the *_bulk() variants by + * checking prefixes. + */ if (str_has_prefix(buf, ARCH_FUNC_PREFIX "kfree") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_free") || - str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmem_cache_free") || str_has_prefix(buf, ARCH_FUNC_PREFIX "__kmalloc") || str_has_prefix(buf, ARCH_FUNC_PREFIX "kmem_cache_alloc")) goto found; @@ -268,8 +273,7 @@ void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *r lockdep_on(); - if (panic_on_warn) - panic("panic_on_warn set ...\n"); + check_panic_on_warn("KFENCE"); /* We encountered a memory safety error, taint the kernel! */ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4734315f7940..5cb401aa2b9d 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -97,8 +97,8 @@ struct collapse_control { /* Num pages scanned per node */ u32 node_load[MAX_NUMNODES]; - /* Last target selected in hpage_collapse_find_target_node() */ - int last_target_node; + /* nodemask for allocation fallback */ + nodemask_t alloc_nmask; }; /** @@ -734,7 +734,6 @@ static void khugepaged_alloc_sleep(void) struct collapse_control khugepaged_collapse_control = { .is_khugepaged = true, - .last_target_node = NUMA_NO_NODE, }; static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) @@ -783,16 +782,11 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) target_node = nid; } - /* do some balance if several nodes have the same hit record */ - if (target_node <= cc->last_target_node) - for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == cc->node_load[nid]) { - target_node = nid; - break; - } + for_each_online_node(nid) { + if (max_value == cc->node_load[nid]) + node_set(nid, cc->alloc_nmask); + } - 
cc->last_target_node = target_node; return target_node; } #else @@ -802,9 +796,10 @@ static int hpage_collapse_find_target_node(struct collapse_control *cc) } #endif -static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) +static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node, + nodemask_t *nmask) { - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); + *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); return false; @@ -862,7 +857,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, if (!*pmd) return SCAN_PMD_NULL; - pmde = pmd_read_atomic(*pmd); + pmde = pmdp_get_lockless(*pmd); #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ @@ -955,12 +950,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, struct collapse_control *cc) { - /* Only allocate from the target node */ gfp_t gfp = (cc->is_khugepaged ? 
alloc_hugepage_khugepaged_gfpmask() : - GFP_TRANSHUGE) | __GFP_THISNODE; + GFP_TRANSHUGE); int node = hpage_collapse_find_target_node(cc); - if (!hpage_collapse_alloc_page(hpage, gfp, node)) + if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask)) return SCAN_ALLOC_HUGE_PAGE_FAIL; if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) return SCAN_CGROUP_CHARGE_FAIL; @@ -1057,6 +1051,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, _pmd = pmdp_collapse_flush(vma, address, pmd); spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); + tlb_remove_table_sync_one(); spin_lock(pte_ptl); result = __collapse_huge_page_isolate(vma, address, pte, cc, @@ -1144,6 +1139,7 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, goto out; memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -1242,15 +1238,8 @@ static int hpage_collapse_scan_pmd(struct mm_struct *mm, /* * Check if the page has any GUP (or other external) pins. * - * Here the check is racy it may see total_mapcount > refcount - * in some cases. - * For example, one process with one forked child process. - * The parent has the PMD split due to MADV_DONTNEED, then - * the child is trying unmap the whole PMD, but khugepaged - * may be scanning the parent between the child has - * PageDoubleMap flag cleared and dec the mapcount. So - * khugepaged may see total_mapcount > refcount. - * + * Here the check may be racy: + * it may see total_mapcount > refcount in some cases? * But such case is ephemeral we could always retry collapse * later. However it may report false positive if the page * has excessive GUP pins (i.e. 512). 
Anyway the same check @@ -1384,16 +1373,43 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, return SCAN_SUCCEED; } +/* + * A note about locking: + * Trying to take the page table spinlocks would be useless here because those + * are only used to synchronize: + * + * - modifying terminal entries (ones that point to a data page, not to another + * page table) + * - installing *new* non-terminal entries + * + * Instead, we need roughly the same kind of protection as free_pgtables() or + * mm_take_all_locks() (but only for a single VMA): + * The mmap lock together with this VMA's rmap locks covers all paths towards + * the page table entries we're messing with here, except for hardware page + * table walks and lockless_pages_from_mm(). + */ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - spinlock_t *ptl; pmd_t pmd; + struct mmu_notifier_range range; mmap_assert_write_locked(mm); - ptl = pmd_lock(vma->vm_mm, pmdp); + if (vma->vm_file) + lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem); + /* + * All anon_vmas attached to the VMA have the same root and are + * therefore locked by the same lock. + */ + if (vma->anon_vma) + lockdep_assert_held_write(&vma->anon_vma->root->rwsem); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr, + addr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); pmd = pmdp_collapse_flush(vma, addr, pmdp); - spin_unlock(ptl); + tlb_remove_table_sync_one(); + mmu_notifier_invalidate_range_end(&range); mm_dec_nr_ptes(mm); page_table_check_pte_clear_range(mm, addr, pmd); pte_free(mm, pmd_pgtable(pmd)); @@ -1444,6 +1460,14 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return SCAN_VMA_CHECK; + /* + * Symmetry with retract_page_tables(): Exclude MAP_PRIVATE mappings + * that got written to. 
Without this, we'd have to also lock the + * anon_vma if one exists. + */ + if (vma->anon_vma) + return SCAN_VMA_CHECK; + /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) return SCAN_PTE_UFFD_WP; @@ -1477,6 +1501,20 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto drop_hpage; } + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that + * we're operating on. + * See collapse_and_free_pmd(). + */ + i_mmap_lock_write(vma->vm_file->f_mapping); + + /* + * This spinlock should be unnecessary: Nobody else should be accessing + * the page tables under spinlock protection here, only + * lockless_pages_from_mm() and the hardware page walker can access page + * tables while all the high-level locks are held in write mode. + */ start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); result = SCAN_FAIL; @@ -1531,6 +1569,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 4: remove pte entries */ collapse_and_free_pmd(mm, vma, haddr, pmd); + i_mmap_unlock_write(vma->vm_file->f_mapping); + maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd @@ -1544,6 +1584,7 @@ drop_hpage: abort: pte_unmap_unlock(start_pte, ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); goto drop_hpage; } @@ -1600,7 +1641,8 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, * An alternative would be drop the check, but check that page * table is clear before calling pmdp_collapse_flush() under * ptl. It has higher chance to recover THP for the VMA, but - * has higher cost too. + * has higher cost too. It would also probably require locking + * the anon_vma. 
*/ if (vma->anon_vma) { result = SCAN_PAGE_ANON; @@ -1702,12 +1744,12 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, { struct address_space *mapping = file->f_mapping; struct page *hpage; - pgoff_t index, end = start + HPAGE_PMD_NR; + pgoff_t index = 0, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); int nr_none = 0, result = SCAN_SUCCEED; bool is_shmem = shmem_file(file); - int nr; + int nr = 0; VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); @@ -1747,6 +1789,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, xas_set(&xas, start); for (index = start; index < end; index++) { struct page *page = xas_next(&xas); + struct folio *folio; VM_BUG_ON(index != xas.xa_index); if (is_shmem) { @@ -1773,8 +1816,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, } if (xa_is_value(page) || !PageUptodate(page)) { - struct folio *folio; - xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ if (shmem_get_folio(mapping->host, index, @@ -1862,13 +1903,15 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (page_mapping(page) != mapping) { + folio = page_folio(page); + + if (folio_mapping(folio) != mapping) { result = SCAN_TRUNCATED; goto out_unlock; } - if (!is_shmem && (PageDirty(page) || - PageWriteback(page))) { + if (!is_shmem && (folio_test_dirty(folio) || + folio_test_writeback(folio))) { /* * khugepaged only works on read-only fd, so this * page is dirty because it hasn't been flushed @@ -1878,20 +1921,20 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (isolate_lru_page(page)) { + if (folio_isolate_lru(folio)) { result = SCAN_DEL_PAGE_LRU; goto out_unlock; } - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) { + if (folio_has_private(folio) && + 
!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; - putback_lru_page(page); + folio_putback_lru(folio); goto out_unlock; } - if (page_mapped(page)) - try_to_unmap(page_folio(page), + if (folio_mapped(folio)) + try_to_unmap(folio, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); @@ -1970,6 +2013,7 @@ xa_unlocked: if (result == SCAN_SUCCEED) { struct page *page, *tmp; + struct folio *folio; /* * Replacing old pages with new one has succeeded, now we @@ -1997,11 +2041,13 @@ xa_unlocked: index++; } - SetPageUptodate(hpage); - page_ref_add(hpage, HPAGE_PMD_NR - 1); + folio = page_folio(hpage); + folio_mark_uptodate(folio); + folio_ref_add(folio, HPAGE_PMD_NR - 1); + if (is_shmem) - set_page_dirty(hpage); - lru_cache_add(hpage); + folio_mark_dirty(folio); + folio_add_lru(folio); /* * Remove pte page tables, so we can re-fault the page as huge. @@ -2059,7 +2105,8 @@ out: mem_cgroup_uncharge(page_folio(hpage)); put_page(hpage); } - /* TODO: tracepoints */ + + trace_mm_khugepaged_collapse_file(mm, hpage, index, is_shmem, addr, file, nr, result); return result; } @@ -2077,6 +2124,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, present = 0; swap = 0; memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) @@ -2157,8 +2205,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, } } - trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname, - present, swap, result); + trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result); return result; } #else @@ -2528,6 +2575,11 @@ void khugepaged_min_free_kbytes_update(void) mutex_unlock(&khugepaged_mutex); } +bool current_is_khugepaged(void) +{ + return kthread_func(current) == khugepaged; +} + static int madvise_collapse_errno(enum scan_result r) { /* @@ -2576,7 +2628,6 @@ int 
madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, if (!cc) return -ENOMEM; cc->is_khugepaged = false; - cc->last_target_node = NUMA_NO_NODE; mmgrab(mm); lru_add_drain_all(); @@ -2602,6 +2653,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, } mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); + nodes_clear(cc->alloc_nmask); if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { struct file *file = get_file(vma->vm_file); pgoff_t pgoff = linear_page_index(vma, addr); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 37af2dc8dac9..92f670edbf51 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -79,6 +79,7 @@ #include <linux/mutex.h> #include <linux/rcupdate.h> #include <linux/stacktrace.h> +#include <linux/stackdepot.h> #include <linux/cache.h> #include <linux/percpu.h> #include <linux/memblock.h> @@ -159,8 +160,7 @@ struct kmemleak_object { u32 checksum; /* memory ranges to be scanned inside an object (empty for all) */ struct hlist_head area_list; - unsigned long trace[MAX_TRACE]; - unsigned int trace_len; + depot_stack_handle_t trace_handle; unsigned long jiffies; /* creation timestamp */ pid_t pid; /* pid of the current task */ char comm[TASK_COMM_LEN]; /* executable name */ @@ -346,19 +346,22 @@ static void print_unreferenced(struct seq_file *seq, struct kmemleak_object *object) { int i; + unsigned long *entries; + unsigned int nr_entries; unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); + nr_entries = stack_depot_fetch(object->trace_handle, &entries); warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", - object->pointer, object->size); + object->pointer, object->size); warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", - object->comm, object->pid, object->jiffies, - msecs_age / 1000, msecs_age % 1000); + object->comm, object->pid, object->jiffies, + msecs_age / 1000, msecs_age % 1000); hex_dump_object(seq, object); 
warn_or_seq_printf(seq, " backtrace:\n"); - for (i = 0; i < object->trace_len; i++) { - void *ptr = (void *)object->trace[i]; - warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); + for (i = 0; i < nr_entries; i++) { + void *ptr = (void *)entries[i]; + warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr); } } @@ -370,15 +373,16 @@ static void print_unreferenced(struct seq_file *seq, static void dump_object_info(struct kmemleak_object *object) { pr_notice("Object 0x%08lx (size %zu):\n", - object->pointer, object->size); + object->pointer, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); + object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); pr_notice(" flags = 0x%x\n", object->flags); pr_notice(" checksum = %u\n", object->checksum); pr_notice(" backtrace:\n"); - stack_trace_print(object->trace, object->trace_len, 4); + if (object->trace_handle) + stack_depot_print(object->trace_handle); } /* @@ -591,12 +595,18 @@ static struct kmemleak_object *find_and_remove_object(unsigned long ptr, int ali return object; } -/* - * Save stack trace to the given array of MAX_TRACE size. 
- */ -static int __save_stack_trace(unsigned long *trace) +static noinline depot_stack_handle_t set_track_prepare(void) { - return stack_trace_save(trace, MAX_TRACE, 2); + depot_stack_handle_t trace_handle; + unsigned long entries[MAX_TRACE]; + unsigned int nr_entries; + + if (!kmemleak_initialized) + return 0; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); + trace_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); + + return trace_handle; } /* @@ -653,7 +663,7 @@ static void __create_object(unsigned long ptr, size_t size, } /* kernel backtrace */ - object->trace_len = __save_stack_trace(object->trace); + object->trace_handle = set_track_prepare(); raw_spin_lock_irqsave(&kmemleak_lock, flags); @@ -692,7 +702,6 @@ static void __create_object(unsigned long ptr, size_t size, rb_link_node(&object->rb_node, rb_parent, link); rb_insert_color(&object->rb_node, is_phys ? &object_phys_tree_root : &object_tree_root); - list_add_tail_rcu(&object->object_list, &object_list); out: raw_spin_unlock_irqrestore(&kmemleak_lock, flags); @@ -1091,7 +1100,7 @@ void __ref kmemleak_update_trace(const void *ptr) } raw_spin_lock_irqsave(&object->lock, flags); - object->trace_len = __save_stack_trace(object->trace); + object->trace_handle = set_track_prepare(); raw_spin_unlock_irqrestore(&object->lock, flags); put_object(object); @@ -1461,6 +1470,27 @@ static void scan_gray_list(void) } /* + * Conditionally call resched() in an object iteration loop while making sure + * that the given object won't go away without RCU read lock by performing a + * get_object() if !pinned. 
+ * + * Return: false if can't do a cond_resched() due to get_object() failure + * true otherwise + */ +static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +{ + if (!pinned && !get_object(object)) + return false; + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (!pinned) + put_object(object); + return true; +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -1471,7 +1501,7 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop1_cnt = 0; + int loop_cnt = 0; jiffies_last_scan = jiffies; @@ -1480,7 +1510,6 @@ static void kmemleak_scan(void) list_for_each_entry_rcu(object, &object_list, object_list) { bool obj_pinned = false; - loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1514,24 +1543,11 @@ static void kmemleak_scan(void) raw_spin_unlock_irq(&object->lock); /* - * Do a cond_resched() to avoid soft lockup every 64k objects. - * Make sure a reference has been taken so that the object - * won't go away without RCU read lock. + * Do a cond_resched() every 64k objects to avoid soft lockup. */ - if (!(loop1_cnt & 0xffff)) { - if (!obj_pinned && !get_object(object)) { - /* Try the next object instead */ - loop1_cnt--; - continue; - } - - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - - if (!obj_pinned) - put_object(object); - } + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, obj_pinned)) + loop_cnt--; /* Try again on next object */ } rcu_read_unlock(); @@ -1598,8 +1614,16 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { /* + * Do a cond_resched() every 64k objects to avoid soft lockup. 
+ */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in * the next scan. @@ -1632,8 +1656,16 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in * the next scan. @@ -2061,6 +2093,7 @@ void __init kmemleak_init(void) if (kmemleak_error) return; + stack_depot_init(); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 35f6b6e6a908..3807502766a3 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -260,6 +260,7 @@ void kmsan_handle_urb(const struct urb *urb, bool is_out) urb->transfer_buffer_length, /*checked*/ false); } +EXPORT_SYMBOL_GPL(kmsan_handle_urb); static void kmsan_handle_dma_page(const void *addr, size_t size, enum dma_data_direction dir) diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 280d15413268..770fe02904f3 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -14,6 +14,7 @@ #include "kmsan.h" #include <linux/gfp.h> +#include <linux/kmsan_string.h> #include <linux/mm.h> #include <linux/uaccess.h> @@ -80,12 +81,16 @@ DECLARE_METADATA_PTR_GETTER(8); * Handle a memory store performed by inline assembly. KMSAN conservatively * attempts to unpoison the outputs of asm() directives to prevent false * positives caused by missed stores. 
+ * + * __msan_instrument_asm_store() may be called for inline assembly code when + * entering or leaving IRQ. We omit the check for kmsan_in_runtime() to ensure + * the memory written to in these cases is also marked as initialized. */ void __msan_instrument_asm_store(void *addr, uintptr_t size) { unsigned long ua_flags; - if (!kmsan_enabled || kmsan_in_runtime()) + if (!kmsan_enabled) return; ua_flags = user_access_save(); @@ -102,10 +107,8 @@ void __msan_instrument_asm_store(void *addr, uintptr_t size) user_access_restore(ua_flags); return; } - kmsan_enter_runtime(); /* Unpoisoning the memory on best effort. */ kmsan_internal_unpoison_memory(addr, size, /*checked*/ false); - kmsan_leave_runtime(); user_access_restore(ua_flags); } EXPORT_SYMBOL(__msan_instrument_asm_store); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 7019c46d33a7..a14744205435 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -124,6 +124,8 @@ static __always_inline bool kmsan_in_runtime(void) { if ((hardirq_count() >> HARDIRQ_SHIFT) > 1) return true; + if (in_nmi()) + return true; return kmsan_get_context()->kmsan_in_runtime; } diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 9a29ea2dbfb9..088e21a48dc4 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -22,6 +22,7 @@ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/tracepoint.h> +#include <linux/vmalloc.h> #include <trace/events/printk.h> static DEFINE_PER_CPU(int, per_cpu_var); @@ -419,6 +420,7 @@ static void test_memcpy_aligned_to_aligned(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); kmsan_check_memory((void *)&dst, sizeof(dst)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); @@ -441,6 +443,7 @@ static void test_memcpy_aligned_to_unaligned(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to unaligned 
dst (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); kmsan_check_memory((void *)dst, 4); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); @@ -464,6 +467,7 @@ static void test_memcpy_aligned_to_unaligned2(struct kunit *test) kunit_info( test, "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); + OPTIMIZER_HIDE_VAR(uninit_src); memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); KUNIT_EXPECT_TRUE(test, report_matches(&expect)); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 21e3e196ec3c..a787c04e9583 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src) __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); kmsan_leave_runtime(); } +EXPORT_SYMBOL(kmsan_copy_page_meta); void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { @@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/pagewalk.h> #include <asm/tlbflush.h> #include "internal.h" @@ -419,47 +420,74 @@ static inline bool ksm_test_exit(struct mm_struct *mm) return atomic_read(&mm->mm_users) == 0; } +static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct page *page = NULL; + spinlock_t *ptl; + pte_t *pte; + int ret; + + if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + return 0; + + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (pte_present(*pte)) { + page = vm_normal_page(walk->vma, addr, *pte); + } else if (!pte_none(*pte)) { + swp_entry_t entry = pte_to_swp_entry(*pte); + + /* + * As KSM pages remain KSM pages until freed, no need to wait + * here for migration to end. 
+ */ + if (is_migration_entry(entry)) + page = pfn_swap_entry_to_page(entry); + } + ret = page && PageKsm(page); + pte_unmap_unlock(pte, ptl); + return ret; +} + +static const struct mm_walk_ops break_ksm_ops = { + .pmd_entry = break_ksm_pmd_entry, +}; + /* - * We use break_ksm to break COW on a ksm page: it's a stripped down - * - * if (get_user_pages(addr, 1, FOLL_WRITE, &page, NULL) == 1) - * put_page(page); + * We use break_ksm to break COW on a ksm page by triggering unsharing, + * such that the ksm page will get replaced by an exclusive anonymous page. * - * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * We take great care only to touch a ksm page, in a VM_MERGEABLE vma, * in case the application has unmapped and remapped mm,addr meanwhile. * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP * mmap of /dev/mem, where we would not want to touch it. * - * FAULT_FLAG/FOLL_REMOTE are because we do this outside the context + * FAULT_FLAG_REMOTE/FOLL_REMOTE are because we do this outside the context * of the process that owns 'vma'. We also do not want to enforce * protection keys here anyway. 
*/ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) { - struct page *page; vm_fault_t ret = 0; do { + int ksm_page; + cond_resched(); - page = follow_page(vma, addr, - FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page)) - break; - if (PageKsm(page)) - ret = handle_mm_fault(vma, addr, - FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE, - NULL); - else - ret = VM_FAULT_WRITE; - put_page(page); - } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); + ksm_page = walk_page_range_vma(vma, addr, addr + 1, + &break_ksm_ops, NULL); + if (WARN_ON_ONCE(ksm_page < 0)) + return ksm_page; + if (!ksm_page) + return 0; + ret = handle_mm_fault(vma, addr, + FAULT_FLAG_UNSHARE | FAULT_FLAG_REMOTE, + NULL); + } while (!(ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); /* - * We must loop because handle_mm_fault() may back out if there's - * any difficulty e.g. if pte accessed bit gets updated concurrently. - * - * VM_FAULT_WRITE is what we have been hoping for: it indicates that - * COW has been broken, even if the vma does not permit VM_WRITE; - * but note that a concurrent fault might break PageKsm for us. + * We must loop until we no longer find a KSM page because + * handle_mm_fault() may back out if there's any difficulty e.g. if + * pte accessed bit gets updated concurrently. 
* * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's @@ -1041,7 +1069,6 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, anon_exclusive = PageAnonExclusive(page); if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || - (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || anon_exclusive || mm_tlb_flush_pending(mm)) { pte_t entry; @@ -1079,11 +1106,11 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (pte_dirty(entry)) set_page_dirty(page); + entry = pte_mkclean(entry); + + if (pte_write(entry)) + entry = pte_wrprotect(entry); - if (pte_protnone(entry)) - entry = pte_mkclean(pte_clear_savedwrite(entry)); - else - entry = pte_mkclean(pte_wrprotect(entry)); set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = *pvmw.pte; @@ -3211,7 +3238,7 @@ static int __init ksm_init(void) #ifdef CONFIG_MEMORY_HOTREMOVE /* There is no significance to this priority 100 */ - hotplug_memory_notifier(ksm_memory_callback, 100); + hotplug_memory_notifier(ksm_memory_callback, KSM_CALLBACK_PRI); #endif return 0; diff --git a/mm/maccess.c b/mm/maccess.c index 5f4d240f67ec..074f6b086671 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -97,7 +97,7 @@ long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) return src - unsafe_addr; Efault: pagefault_enable(); - dst[-1] = '\0'; + dst[0] = '\0'; return -EFAULT; } diff --git a/mm/madvise.c b/mm/madvise.c index 2baa93ca2310..a56a6d17e201 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -95,9 +95,6 @@ struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - if (vma->vm_file) - return NULL; - return vma->anon_name; } @@ -183,7 +180,7 @@ success: * vm_flags is protected by the mmap_lock held in write mode. 
*/ vma->vm_flags = new_flags; - if (!vma->vm_file) { + if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); if (error) return error; @@ -226,6 +223,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, put_page(page); } swap_read_unplug(splug); + cond_resched(); return 0; } @@ -321,6 +319,21 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } +static inline bool can_do_file_pageout(struct vm_area_struct *vma) +{ + if (!vma->vm_file) + return false; + /* + * paging out pagecache only for non-anonymous mappings that correspond + * to the files the calling process could (if tried) open for writing; + * otherwise we'd be including shared non-exclusive mappings, which + * opens a side channel. + */ + return inode_owner_or_capable(&init_user_ns, + file_inode(vma->vm_file)) || + file_permission(vma->vm_file, MAY_WRITE) == 0; +} + static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -334,10 +347,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, spinlock_t *ptl; struct page *page = NULL; LIST_HEAD(page_list); + bool pageout_anon_only_filter; if (fatal_signal_pending(current)) return -EINTR; + pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && + !can_do_file_pageout(vma); + #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_trans_huge(*pmd)) { pmd_t orig_pmd; @@ -364,6 +381,9 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, if (page_mapcount(page) != 1) goto huge_unlock; + if (pageout_anon_only_filter && !PageAnon(page)) + goto huge_unlock; + if (next - addr != HPAGE_PMD_SIZE) { int err; @@ -432,6 +452,8 @@ regular_page: if (PageTransCompound(page)) { if (page_mapcount(page) != 1) break; + if (pageout_anon_only_filter && !PageAnon(page)) + break; get_page(page); if (!trylock_page(page)) { put_page(page); @@ -459,6 +481,9 @@ regular_page: if (!PageLRU(page) || page_mapcount(page) != 1) continue; + if 
(pageout_anon_only_filter && !PageAnon(page)) + continue; + VM_BUG_ON_PAGE(PageTransCompound(page), page); if (pte_young(ptent)) { @@ -553,23 +578,6 @@ static void madvise_pageout_page_range(struct mmu_gather *tlb, tlb_end_vma(tlb, vma); } -static inline bool can_do_pageout(struct vm_area_struct *vma) -{ - if (vma_is_anonymous(vma)) - return true; - if (!vma->vm_file) - return false; - /* - * paging out pagecache only for non-anonymous mappings that correspond - * to the files the calling process could (if tried) open for writing; - * otherwise we'd be including shared non-exclusive mappings, which - * opens a side channel. - */ - return inode_owner_or_capable(&init_user_ns, - file_inode(vma->vm_file)) || - file_permission(vma->vm_file, MAY_WRITE) == 0; -} - static long madvise_pageout(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start_addr, unsigned long end_addr) @@ -581,7 +589,14 @@ static long madvise_pageout(struct vm_area_struct *vma, if (!can_madv_lru_vma(vma)) return -EINVAL; - if (!can_do_pageout(vma)) + /* + * If the VMA belongs to a private file mapping, there can be private + * dirty pages which can be paged out if even this process is neither + * owner nor write capable of the file. We allow private file mappings + * further to pageout dirty anon pages. + */ + if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && + (vma->vm_flags & VM_MAYSHARE))) return 0; lru_add_drain(); @@ -772,8 +787,8 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, * Application no longer needs these pages. If the pages are dirty, * it's OK to just throw them away. The app will be more careful about * data it wants to keep. Be sure to free swap resources too. 
The - * zap_page_range call sets things up for shrink_active_list to actually free - * these pages later if no one else has touched them in the meantime, + * zap_page_range_single call sets things up for shrink_active_list to actually + * free these pages later if no one else has touched them in the meantime, * although we could add these pages to a global reuse list for * shrink_active_list to pick up before reclaiming other pages. * @@ -790,7 +805,7 @@ static int madvise_free_single_vma(struct vm_area_struct *vma, static long madvise_dontneed_single_vma(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - zap_page_range(vma, start, end - start); + zap_page_range_single(vma, start, end - start, NULL); return 0; } @@ -813,7 +828,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (start & ~huge_page_mask(hstate_vma(vma))) return false; - *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed. 
+ */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + return true; } @@ -828,6 +850,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; + if (start == end) + return 0; + if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ @@ -1263,7 +1288,7 @@ static int madvise_vma_anon_name(struct vm_area_struct *vma, int error; /* Only anonymous mappings can be named */ - if (vma->vm_file) + if (vma->vm_file && !vma_is_anon_shmem(vma)) return -EBADF; error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, @@ -1449,7 +1474,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto out; } - ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); if (ret < 0) goto out; diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index 1b0ab8fcfd8b..175e424b9ab1 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -126,7 +126,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - pmd_t pmdval = pmd_read_atomic(pmd); + pmd_t pmdval = pmdp_get_lockless(pmd); if (!pmd_trans_unstable(&pmdval)) return 0; diff --git a/mm/memblock.c b/mm/memblock.c index 511d4783dcf1..d036c7861310 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -836,7 +836,7 @@ void __init_memblock memblock_free(void *ptr, size_t size) * @base: phys starting address of the boot memory block * @size: size of the boot memory block in bytes * - * Free boot memory block previously allocated by memblock_alloc_xx() API. + * Free boot memory block previously allocated by memblock_phys_alloc_xx() API. * The freeing memory will not be released to the buddy allocator. 
*/ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d8549ae1b30..ab457f0394ab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,6 +63,7 @@ #include <linux/resume_user_mode.h> #include <linux/psi.h> #include <linux/seq_buf.h> +#include <linux/parser.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -661,8 +662,10 @@ static const unsigned int memcg_vm_event_stat[] = { PGPGOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, + PGSCAN_KHUGEPAGED, PGSTEAL_KSWAPD, PGSTEAL_DIRECT, + PGSTEAL_KHUGEPAGED, PGFAULT, PGMAJFAULT, PGREFILL, @@ -1219,7 +1222,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) * cgroup root (root_mem_cgroup). So we have to handle * dead_memcg from cgroup root separately. */ - if (last != root_mem_cgroup) + if (!mem_cgroup_is_root(last)) __invalidate_reclaim_iterators(root_mem_cgroup, dead_memcg); } @@ -1243,7 +1246,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct mem_cgroup *iter; int ret = 0; - BUG_ON(memcg == root_mem_cgroup); + BUG_ON(mem_cgroup_is_root(memcg)); for_each_mem_cgroup_tree(iter, memcg) { struct css_task_iter it; @@ -1272,7 +1275,7 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) memcg = folio_memcg(folio); if (!memcg) - VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio); + VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio); else VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio); } @@ -1574,10 +1577,12 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) /* Accumulated memory events */ seq_buf_printf(&s, "pgscan %lu\n", memcg_events(memcg, PGSCAN_KSWAPD) + - memcg_events(memcg, PGSCAN_DIRECT)); + memcg_events(memcg, PGSCAN_DIRECT) + + memcg_events(memcg, PGSCAN_KHUGEPAGED)); seq_buf_printf(&s, "pgsteal %lu\n", memcg_events(memcg, PGSTEAL_KSWAPD) + - memcg_events(memcg, PGSTEAL_DIRECT)); + memcg_events(memcg, 
PGSTEAL_DIRECT) + + memcg_events(memcg, PGSTEAL_KHUGEPAGED)); for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { if (memcg_vm_event_stat[i] == PGPGIN || @@ -2036,7 +2041,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, rcu_read_lock(); memcg = mem_cgroup_from_task(victim); - if (memcg == root_mem_cgroup) + if (mem_cgroup_is_root(memcg)) goto out; /* @@ -2388,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, psi_memstall_enter(&pflags); nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, - MEMCG_RECLAIM_MAY_SWAP); + MEMCG_RECLAIM_MAY_SWAP, + NULL); psi_memstall_leave(&pflags); } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); @@ -2679,7 +2685,8 @@ retry: psi_memstall_enter(&pflags); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, - gfp_mask, reclaim_options); + gfp_mask, reclaim_options, + NULL); psi_memstall_leave(&pflags); if (mem_cgroup_margin(mem_over_limit) >= nr_pages) @@ -2995,7 +3002,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg) { struct obj_cgroup *objcg = NULL; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { objcg = rcu_dereference(memcg->objcg); if (objcg && obj_cgroup_tryget(objcg)) break; @@ -3026,7 +3033,7 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) { struct obj_cgroup *objcg; - if (!memcg_kmem_enabled() || memcg_kmem_bypass()) + if (!memcg_kmem_enabled()) return NULL; if (PageMemcgKmem(page)) { @@ -3499,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, } if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { + memsw ? 
0 : MEMCG_RECLAIM_MAY_SWAP, + NULL)) { ret = -EBUSY; break; } @@ -3610,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return -EINTR; if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, - MEMCG_RECLAIM_MAY_SWAP)) + MEMCG_RECLAIM_MAY_SWAP, + NULL)) nr_retries--; } @@ -4832,6 +4841,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, unsigned int efd, cfd; struct fd efile; struct fd cfile; + struct dentry *cdentry; const char *name; char *endp; int ret; @@ -4886,6 +4896,16 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, goto out_put_cfile; /* + * The control file must be a regular cgroup1 file. As a regular cgroup + * file can't be renamed, it's safe to access its name afterwards. + */ + cdentry = cfile.file->f_path.dentry; + if (cdentry->d_sb->s_type != &cgroup_fs_type || !d_is_reg(cdentry)) { + ret = -EINVAL; + goto out_put_cfile; + } + + /* * Determine the event callbacks and set them in @event. This used * to be done via struct cftype but cgroup core no longer knows * about these events. The following is crude but the whole thing @@ -4893,7 +4913,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * * DO NOT ADD NEW FILES. */ - name = cfile.file->f_path.dentry->d_name.name; + name = cdentry->d_name.name; if (!strcmp(name, "memory.usage_in_bytes")) { event->register_event = mem_cgroup_usage_register_event; @@ -4917,7 +4937,7 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, * automatically removed on cgroup destruction but the removal is * asynchronous, so take an extra ref on @css. 
*/ - cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent, + cfile_css = css_tryget_online_from_dir(cdentry->d_parent, &memory_cgrp_subsys); ret = -EINVAL; if (IS_ERR(cfile_css)) @@ -5648,15 +5668,21 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, static struct page *mc_handle_file_pte(struct vm_area_struct *vma, unsigned long addr, pte_t ptent) { + unsigned long index; + struct folio *folio; + if (!vma->vm_file) /* anonymous vma */ return NULL; if (!(mc.flags & MOVE_FILE)) return NULL; - /* page is moved even if it's not RSS of this task(page-faulted). */ + /* folio is moved even if it's not RSS of this task(page-faulted). */ /* shmem/tmpfs may report page out on swap: account for that too. */ - return find_get_incore_page(vma->vm_file->f_mapping, - linear_page_index(vma, addr)); + index = linear_page_index(vma, addr); + folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); + if (!folio) + return NULL; + return folio_file_page(folio, index); } /** @@ -5741,6 +5767,12 @@ static int mem_cgroup_move_account(struct page *page, } } +#ifdef CONFIG_SWAP + if (folio_test_swapcache(folio)) { + __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); + __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); + } +#endif if (folio_test_writeback(folio)) { __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); @@ -6397,7 +6429,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, } reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, + NULL); if (!reclaimed && !nr_retries--) break; @@ -6446,7 +6479,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (nr_reclaims) { if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, - GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) + GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, + NULL)) nr_reclaims--; continue; } @@ -6569,21 +6603,54 @@ 
static ssize_t memory_oom_group_write(struct kernfs_open_file *of, return nbytes; } +enum { + MEMORY_RECLAIM_NODES = 0, + MEMORY_RECLAIM_NULL, +}; + +static const match_table_t if_tokens = { + { MEMORY_RECLAIM_NODES, "nodes=%s" }, + { MEMORY_RECLAIM_NULL, NULL }, +}; + static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned long nr_to_reclaim, nr_reclaimed = 0; - unsigned int reclaim_options; - int err; + unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | + MEMCG_RECLAIM_PROACTIVE; + char *old_buf, *start; + substring_t args[MAX_OPT_ARGS]; + int token; + char value[256]; + nodemask_t nodemask = NODE_MASK_ALL; buf = strstrip(buf); - err = page_counter_memparse(buf, "", &nr_to_reclaim); - if (err) - return err; - reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; + old_buf = buf; + nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE; + if (buf == old_buf) + return -EINVAL; + + buf = strstrip(buf); + + while ((start = strsep(&buf, " ")) != NULL) { + if (!strlen(start)) + continue; + token = match_token(start, if_tokens, args); + match_strlcpy(value, args, sizeof(value)); + switch (token) { + case MEMORY_RECLAIM_NODES: + if (nodelist_parse(value, nodemask) < 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + } + while (nr_reclaimed < nr_to_reclaim) { unsigned long reclaimed; @@ -6600,7 +6667,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options); + GFP_KERNEL, reclaim_options, + &nodemask); if (!reclaimed && !nr_retries--) return -EAGAIN; @@ -7163,7 +7231,7 @@ void mem_cgroup_sk_alloc(struct sock *sk) rcu_read_lock(); memcg = mem_cgroup_from_task(current); - if (memcg == root_mem_cgroup) + if (mem_cgroup_is_root(memcg)) goto out; if 
(!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active) goto out; @@ -7298,7 +7366,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) * The root cgroup cannot be destroyed, so it's refcount must * always be >= 1. */ - if (WARN_ON_ONCE(memcg == root_mem_cgroup)) { + if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) { VM_BUG_ON(1); break; } @@ -7462,7 +7530,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) if (mem_cgroup_disabled() || do_memsw_account()) return nr_swap_pages; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); @@ -7484,7 +7552,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long usage = page_counter_read(&memcg->swap); if (usage * 2 >= READ_ONCE(memcg->swap.high) || @@ -7648,7 +7716,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) return true; original_memcg = get_mem_cgroup_from_objcg(objcg); - for (memcg = original_memcg; memcg != root_mem_cgroup; + for (memcg = original_memcg; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) { unsigned long max = READ_ONCE(memcg->zswap_max); unsigned long pages; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 145bb561ddb3..c77a9e37e27e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -74,6 +74,19 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; +inline void num_poisoned_pages_inc(unsigned long pfn) +{ + atomic_long_inc(&num_poisoned_pages); + memblk_nr_poison_inc(pfn); +} + +inline void num_poisoned_pages_sub(unsigned long pfn, long i) +{ + atomic_long_sub(i, 
&num_poisoned_pages); + if (pfn != -1UL) + memblk_nr_poison_sub(pfn, i); +} + /* * Return values: * 1: the page is dissolved (if needed) and taken off from buddy, @@ -115,7 +128,7 @@ static bool page_handle_poison(struct page *page, bool hugepage_or_freepage, boo if (release) put_page(page); page_ref_inc(page); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); return true; } @@ -827,12 +840,13 @@ static int truncate_error_page(struct page *p, unsigned long pfn, int ret = MF_FAILED; if (mapping->a_ops->error_remove_page) { + struct folio *folio = page_folio(p); int err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { pr_info("%#lx: Failed to punch page: %d\n", pfn, err); - } else if (page_has_private(p) && - !try_to_release_page(p, GFP_NOIO)) { + } else if (folio_has_private(folio) && + !filemap_release_folio(folio, GFP_NOIO)) { pr_info("%#lx: failed to release buffers\n", pfn); } else { ret = MF_RECOVERED; @@ -1080,6 +1094,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) int res; struct page *hpage = compound_head(p); struct address_space *mapping; + bool extra_pins = false; if (!PageHuge(hpage)) return MF_DELAYED; @@ -1087,6 +1102,8 @@ static int me_huge_page(struct page_state *ps, struct page *p) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, page_to_pfn(p), mapping); + /* The page is kept in page cache. */ + extra_pins = true; unlock_page(hpage); } else { unlock_page(hpage); @@ -1104,7 +1121,7 @@ static int me_huge_page(struct page_state *ps, struct page *p) } } - if (has_extra_refcount(ps, p, false)) + if (has_extra_refcount(ps, p, extra_pins)) res = MF_FAILED; return res; @@ -1179,14 +1196,16 @@ static struct page_state error_states[] = { * "Dirty/Clean" indication is not 100% accurate due to the possibility of * setting PG_dirty outside page lock. See also comment above set_page_dirty(). 
*/ -static void action_result(unsigned long pfn, enum mf_action_page_type type, - enum mf_result result) +static int action_result(unsigned long pfn, enum mf_action_page_type type, + enum mf_result result) { trace_memory_failure_event(pfn, type, result); - num_poisoned_pages_inc(); + num_poisoned_pages_inc(pfn); pr_err("%#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); + + return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } static int page_action(struct page_state *ps, struct page *p, @@ -1197,14 +1216,12 @@ static int page_action(struct page_state *ps, struct page *p, /* page p should be unlocked after returning from ps->action(). */ result = ps->action(ps, p); - action_result(pfn, ps->type, result); - /* Could do more checks here if page looks ok */ /* * Could adjust zone counters here to correct for the missing page. */ - return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; + return action_result(pfn, ps->type, result); } static inline bool PageHWPoisonTakenOff(struct page *page) @@ -1244,7 +1261,7 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, false); if (hugetlb) return ret; @@ -1334,7 +1351,7 @@ static int __get_unpoison_page(struct page *page) int ret = 0; bool hugetlb = false; - ret = get_hwpoison_huge_page(head, &hugetlb); + ret = get_hwpoison_huge_page(head, &hugetlb, true); if (hugetlb) return ret; @@ -1671,8 +1688,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #ifdef CONFIG_HUGETLB_PAGE /* * Struct raw_hwp_page represents information about "raw error page", - * constructing singly linked list originated from ->private field of - * SUBPAGE_INDEX_HWPOISON-th tail page. + * constructing singly linked list from ->_hugetlb_hwpoison field of folio. 
*/ struct raw_hwp_page { struct llist_node node; @@ -1681,7 +1697,7 @@ struct raw_hwp_page { static inline struct llist_head *raw_hwp_list_head(struct page *hpage) { - return (struct llist_head *)&page_private(hpage + SUBPAGE_INDEX_HWPOISON); + return (struct llist_head *)&page_folio(hpage)->_hugetlb_hwpoison; } static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) @@ -1696,6 +1712,8 @@ static unsigned long __free_raw_hwp_pages(struct page *hpage, bool move_flag) if (move_flag) SetPageHWPoison(p->page); + else + num_poisoned_pages_sub(page_to_pfn(p->page), 1); kfree(p); count++; } @@ -1731,7 +1749,7 @@ static int hugetlb_set_page_hwpoison(struct page *hpage, struct page *page) llist_add(&raw_hwp->node, head); /* the first error event will be counted in action_result(). */ if (ret) - num_poisoned_pages_inc(); + num_poisoned_pages_inc(page_to_pfn(page)); } else { /* * Failed to save raw error info. We no longer trace all @@ -1785,7 +1803,8 @@ void hugetlb_clear_page_hwpoison(struct page *hpage) * -EBUSY - the hugepage is busy (try to retry) * -EHWPOISON - the hugepage is already hwpoisoned */ -int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) +int __get_huge_page_for_hwpoison(unsigned long pfn, int flags, + bool *migratable_cleared) { struct page *page = pfn_to_page(pfn); struct page *head = compound_head(page); @@ -1815,6 +1834,15 @@ int __get_huge_page_for_hwpoison(unsigned long pfn, int flags) goto out; } + /* + * Clearing HPageMigratable for hwpoisoned hugepages to prevent them + * from being migrated by memory hotremove. 
+ */ + if (count_increased && HPageMigratable(head)) { + ClearHPageMigratable(head); + *migratable_cleared = true; + } + return ret; out: if (count_increased) @@ -1834,10 +1862,11 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb struct page *p = pfn_to_page(pfn); struct page *head; unsigned long page_flags; + bool migratable_cleared = false; *hugetlb = 1; retry: - res = get_huge_page_for_hwpoison(pfn, flags); + res = get_huge_page_for_hwpoison(pfn, flags, &migratable_cleared); if (res == 2) { /* fallback to normal page handling */ *hugetlb = 0; return 0; @@ -1853,8 +1882,7 @@ retry: flags |= MF_NO_RETRY; goto retry; } - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - return res; + return action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); } head = compound_head(p); @@ -1862,6 +1890,8 @@ retry: if (hwpoison_filter(p)) { hugetlb_clear_page_hwpoison(head); + if (migratable_cleared) + SetHPageMigratable(head); unlock_page(head); if (res == 1) put_page(head); @@ -1880,22 +1910,17 @@ retry: } else { res = MF_FAILED; } - action_result(pfn, MF_MSG_FREE_HUGE, res); - return res == MF_RECOVERED ? 
0 : -EBUSY; + return action_result(pfn, MF_MSG_FREE_HUGE, res); } page_flags = head->flags; if (!hwpoison_user_mappings(p, pfn, flags, head)) { - action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); - res = -EBUSY; - goto out; + unlock_page(head); + return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } return identify_page_state(pfn, p, page_flags); -out: - unlock_page(head); - return res; } #else @@ -1910,17 +1935,25 @@ static inline unsigned long free_raw_hwp_pages(struct page *hpage, bool flag) } #endif /* CONFIG_HUGETLB_PAGE */ +/* Drop the extra refcount in case we come from madvise() */ +static void put_ref_page(unsigned long pfn, int flags) +{ + struct page *page; + + if (!(flags & MF_COUNT_INCREASED)) + return; + + page = pfn_to_page(pfn); + if (page) + put_page(page); +} + static int memory_failure_dev_pagemap(unsigned long pfn, int flags, struct dev_pagemap *pgmap) { - struct page *page = pfn_to_page(pfn); int rc = -ENXIO; - if (flags & MF_COUNT_INCREASED) - /* - * Drop the extra refcount in case we come from madvise(). - */ - put_page(page); + put_ref_page(pfn, flags); /* device metadata space is not recoverable */ if (!pgmap_pfn_valid(pgmap, pfn)) @@ -2052,16 +2085,13 @@ try_again: } res = MF_FAILED; } - action_result(pfn, MF_MSG_BUDDY, res); - res = res == MF_RECOVERED ? 
0 : -EBUSY; + res = action_result(pfn, MF_MSG_BUDDY, res); } else { - action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); } goto unlock_mutex; } else if (res < 0) { - action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED); goto unlock_mutex; } } @@ -2082,8 +2112,7 @@ try_again: */ SetPageHasHWPoisoned(hpage); if (try_to_split_thp_page(p) < 0) { - action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); @@ -2116,8 +2145,7 @@ try_again: retry = false; goto try_again; } - action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); goto unlock_page; } @@ -2157,8 +2185,7 @@ try_again: * Abort on fail: __filemap_remove_folio() assumes unmapped page. */ if (!hwpoison_user_mappings(p, pfn, flags, p)) { - action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); goto unlock_page; } @@ -2166,8 +2193,7 @@ try_again: * Torn down by someone else? 
*/ if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { - action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); - res = -EBUSY; + res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); goto unlock_page; } @@ -2312,8 +2338,8 @@ int unpoison_memory(unsigned long pfn) struct page *page; struct page *p; int ret = -EBUSY; - int freeit = 0; unsigned long count = 1; + bool huge = false; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -2362,6 +2388,7 @@ int unpoison_memory(unsigned long pfn) ret = get_hwpoison_page(p, MF_UNPOISON); if (!ret) { if (PageHuge(p)) { + huge = true; count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; @@ -2377,6 +2404,7 @@ int unpoison_memory(unsigned long pfn) pfn, &unpoison_rs); } else { if (PageHuge(p)) { + huge = true; count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; @@ -2384,10 +2412,9 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } } - freeit = !!TestClearPageHWPoison(p); put_page(page); - if (freeit) { + if (TestClearPageHWPoison(p)) { put_page(page); ret = 0; } @@ -2395,8 +2422,9 @@ int unpoison_memory(unsigned long pfn) unlock_mutex: mutex_unlock(&mf_mutex); - if (!ret || freeit) { - num_poisoned_pages_sub(count); + if (!ret) { + if (!huge) + num_poisoned_pages_sub(pfn, 1); unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", page_to_pfn(p), &unpoison_rs); } @@ -2513,12 +2541,6 @@ static int soft_offline_in_use_page(struct page *page) return ret; } -static void put_ref_page(struct page *page) -{ - if (page) - put_page(page); -} - /** * soft_offline_page - Soft offline a page. 
* @pfn: pfn to soft-offline @@ -2547,19 +2569,17 @@ int soft_offline_page(unsigned long pfn, int flags) { int ret; bool try_again = true; - struct page *page, *ref_page = NULL; - - WARN_ON_ONCE(!pfn_valid(pfn) && (flags & MF_COUNT_INCREASED)); + struct page *page; - if (!pfn_valid(pfn)) + if (!pfn_valid(pfn)) { + WARN_ON_ONCE(flags & MF_COUNT_INCREASED); return -ENXIO; - if (flags & MF_COUNT_INCREASED) - ref_page = pfn_to_page(pfn); + } /* Only online pages can be soft-offlined (esp., not ZONE_DEVICE). */ page = pfn_to_online_page(pfn); if (!page) { - put_ref_page(ref_page); + put_ref_page(pfn, flags); return -EIO; } @@ -2567,7 +2587,7 @@ int soft_offline_page(unsigned long pfn, int flags) if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); - put_ref_page(ref_page); + put_ref_page(pfn, flags); mutex_unlock(&mf_mutex); return 0; } @@ -2599,26 +2619,3 @@ retry: return ret; } - -void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) -{ - int i, total = 0; - - /* - * A further optimization is to have per section refcounted - * num_poisoned_pages. But that would need more space per memmap, so - * for now just do a quick global check to speed up this routine in the - * absence of bad pages. 
- */ - if (atomic_long_read(&num_poisoned_pages) == 0) - return; - - for (i = 0; i < nr_pages; i++) { - if (PageHWPoison(&memmap[i])) { - total++; - ClearPageHWPoison(&memmap[i]); - } - } - if (total) - num_poisoned_pages_sub(total); -} diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index f116b7b6333e..c734658c6242 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev) kfree(tier); } -static ssize_t nodes_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; @@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev, mutex_unlock(&memory_tier_lock); return ret; } -static DEVICE_ATTR_RO(nodes); +static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { - &dev_attr_nodes.attr, + &dev_attr_nodelist.attr, NULL }; @@ -645,7 +645,7 @@ static int __init memory_tier_init(void) * than default DRAM tier. 
*/ default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM); - if (!default_dram_type) + if (IS_ERR(default_dram_type)) panic("%s() failed to allocate default DRAM tier\n", __func__); /* @@ -664,7 +664,7 @@ static int __init memory_tier_init(void) establish_demotion_targets(); mutex_unlock(&memory_tier_lock); - hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRIO); + hotplug_memory_notifier(memtier_hotplug_callback, MEMTIER_HOTPLUG_PRI); return 0; } subsys_initcall(memory_tier_init); diff --git a/mm/memory.c b/mm/memory.c index f88c351aecd4..aad226daf41b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -162,58 +162,11 @@ static int __init init_zero_pfn(void) } early_initcall(init_zero_pfn); -void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) +void mm_trace_rss_stat(struct mm_struct *mm, int member) { - trace_rss_stat(mm, member, count); + trace_rss_stat(mm, member); } -#if defined(SPLIT_RSS_COUNTING) - -void sync_mm_rss(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - if (current->rss_stat.count[i]) { - add_mm_counter(mm, i, current->rss_stat.count[i]); - current->rss_stat.count[i] = 0; - } - } - current->rss_stat.events = 0; -} - -static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) -{ - struct task_struct *task = current; - - if (likely(task->mm == mm)) - task->rss_stat.count[member] += val; - else - add_mm_counter(mm, member, val); -} -#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) -#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) - -/* sync counter once per 64 page faults */ -#define TASK_RSS_EVENTS_THRESH (64) -static void check_sync_rss_stat(struct task_struct *task) -{ - if (unlikely(task != current)) - return; - if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) - sync_mm_rss(task->mm); -} -#else /* SPLIT_RSS_COUNTING */ - -#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) -#define 
dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) - -static void check_sync_rss_stat(struct task_struct *task) -{ -} - -#endif /* SPLIT_RSS_COUNTING */ - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. @@ -1341,15 +1294,6 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) return ret; } -/* - * Parameter block passed down to zap_pte_range in exceptional cases. - */ -struct zap_details { - struct folio *single_folio; /* Locked folio to be unmapped */ - bool even_cows; /* Zap COWed private pages too? */ - zap_flags_t zap_flags; /* Extra flags for zapping */ -}; - /* Whether we should zap all COWed (private) pages too */ static inline bool should_zap_cows(struct zap_details *details) { @@ -1393,12 +1337,10 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, struct zap_details *details, pte_t pteval) { -#ifdef CONFIG_PTE_MARKER_UFFD_WP if (zap_drop_file_uffd_wp(details)) return; pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); -#endif } static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -1432,6 +1374,8 @@ again: break; if (pte_present(ptent)) { + unsigned int delay_rmap; + page = vm_normal_page(vma, addr, ptent); if (unlikely(!should_zap_page(details, page))) continue; @@ -1443,20 +1387,26 @@ again: if (unlikely(!page)) continue; + delay_rmap = 0; if (!PageAnon(page)) { if (pte_dirty(ptent)) { - force_flush = 1; set_page_dirty(page); + if (tlb_delay_rmap(tlb)) { + delay_rmap = 1; + force_flush = 1; + } } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); } rss[mm_counter(page)]--; - page_remove_rmap(page, vma, false); - if (unlikely(page_mapcount(page) < 0)) - print_bad_pte(vma, addr, ptent, page); - if (unlikely(__tlb_remove_page(tlb, page))) { + if (!delay_rmap) { + page_remove_rmap(page, vma, false); + if (unlikely(page_mapcount(page) < 0)) + 
print_bad_pte(vma, addr, ptent, page); + } + if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) { force_flush = 1; addr += PAGE_SIZE; break; @@ -1513,8 +1463,10 @@ again: arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ - if (force_flush) + if (force_flush) { tlb_flush_mmu_tlbonly(tlb); + tlb_flush_rmaps(tlb, vma); + } pte_unmap_unlock(start_pte, ptl); /* @@ -1720,7 +1672,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, { struct mmu_notifier_range range; struct zap_details details = { - .zap_flags = ZAP_FLAG_DROP_MARKER, + .zap_flags = ZAP_FLAG_DROP_MARKER | ZAP_FLAG_UNMAP, /* Careful - we need to zap private pages too! */ .even_cows = true, }; @@ -1774,19 +1726,27 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, * * The range must fit into one VMA. */ -static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, +void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { + const unsigned long end = address + size; struct mmu_notifier_range range; struct mmu_gather tlb; lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - address, address + size); + address, end); + if (is_vm_hugetlb_page(vma)) + adjust_range_if_pmd_sharing_possible(vma, &range.start, + &range.end); tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); - unmap_single_vma(&tlb, vma, address, range.end, details); + /* + * unmap 'address-end' not 'range.start-range.end' as range + * could have been expanded for hugetlb pmd sharing. + */ + unmap_single_vma(&tlb, vma, address, end, details); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } @@ -1860,7 +1820,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte, return -EBUSY; /* Ok, finally just insert the thing.. 
*/ get_page(page); - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot)); return 0; @@ -2848,10 +2808,16 @@ static inline int pte_unmap_same(struct vm_fault *vmf) return same; } -static inline bool __wp_page_copy_user(struct page *dst, struct page *src, - struct vm_fault *vmf) +/* + * Return: + * 0: copied succeeded + * -EHWPOISON: copy failed due to hwpoison in source page + * -EAGAIN: copied failed (some other reason) + */ +static inline int __wp_page_copy_user(struct page *dst, struct page *src, + struct vm_fault *vmf) { - bool ret; + int ret; void *kaddr; void __user *uaddr; bool locked = false; @@ -2860,8 +2826,11 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, unsigned long addr = vmf->address; if (likely(src)) { - copy_user_highpage(dst, src, addr, vma); - return true; + if (copy_mc_user_highpage(dst, src, addr, vma)) { + memory_failure_queue(page_to_pfn(src), 0); + return -EHWPOISON; + } + return 0; } /* @@ -2888,7 +2857,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, * and update local tlb only */ update_mmu_tlb(vma, addr, vmf->pte); - ret = false; + ret = -EAGAIN; goto pte_unlock; } @@ -2913,7 +2882,7 @@ static inline bool __wp_page_copy_user(struct page *dst, struct page *src, if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ update_mmu_tlb(vma, addr, vmf->pte); - ret = false; + ret = -EAGAIN; goto pte_unlock; } @@ -2932,7 +2901,7 @@ warn: } } - ret = true; + ret = 0; pte_unlock: if (locked) @@ -3104,6 +3073,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_t entry; int page_copied = 0; struct mmu_notifier_range range; + int ret; delayacct_wpcopy_start(); @@ -3121,19 +3091,21 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (!new_page) goto oom; - if 
(!__wp_page_copy_user(new_page, old_page, vmf)) { + ret = __wp_page_copy_user(new_page, old_page, vmf); + if (ret) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. + * The -EHWPOISON case will not be retried. */ put_page(new_page); if (old_page) put_page(old_page); delayacct_wpcopy_end(); - return 0; + return ret == -EHWPOISON ? VM_FAULT_HWPOISON : 0; } kmsan_copy_page_meta(new_page, old_page); } @@ -3156,12 +3128,11 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter_fast(mm, - mm_counter_file(old_page)); - inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter(mm, mm_counter_file(old_page)); + inc_mm_counter(mm, MM_ANONPAGES); } } else { - inc_mm_counter_fast(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); @@ -3242,7 +3213,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } delayacct_wpcopy_end(); - return (page_copied && !unshare) ? 
VM_FAULT_WRITE : 0; + return 0; oom_free_new: put_page(new_page); oom: @@ -3306,14 +3277,14 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); - return VM_FAULT_WRITE; + return 0; } static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - vm_fault_t ret = VM_FAULT_WRITE; + vm_fault_t ret = 0; get_page(vmf->page); @@ -3370,10 +3341,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; - struct folio *folio; - - VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); - VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); + struct folio *folio = NULL; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, *vmf->pte)) { @@ -3391,13 +3359,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) } vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); - if (!vmf->page) { - if (unlikely(unshare)) { - /* No anonymous page -> nothing to do. */ - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + /* + * Shared mapping: we are guaranteed to have VM_WRITE and + * FAULT_FLAG_WRITE set at this point. + */ + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. @@ -3405,20 +3372,19 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ - if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED)) + if (!vmf->page) return wp_pfn_shared(vmf); - - pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf); + return wp_page_shared(vmf); } + if (vmf->page) + folio = page_folio(vmf->page); + /* - * Take out anonymous pages first, anonymous shared vmas are - * not dirty accountable. 
+ * Private mapping: create an exclusive anonymous page copy if reuse + * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. */ - folio = page_folio(vmf->page); - if (folio_test_anon(folio)) { + if (folio && folio_test_anon(folio)) { /* * If the page is exclusive to this process we must reuse the * page without further checks. @@ -3464,24 +3430,18 @@ reuse: return 0; } wp_page_reuse(vmf); - return VM_FAULT_WRITE; - } else if (unshare) { - /* No anonymous page -> nothing to do. */ - pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; - } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == - (VM_WRITE|VM_SHARED))) { - return wp_page_shared(vmf); } copy: /* * Ok, we need to copy. Oh, well.. */ - get_page(vmf->page); + if (folio) + folio_get(folio); pte_unmap_unlock(vmf->pte, vmf->ptl); #ifdef CONFIG_KSM - if (PageKsm(vmf->page)) + if (folio && folio_test_ksm(folio)) count_vm_event(COW_KSM); #endif return wp_page_copy(vmf); @@ -3701,11 +3661,14 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) unsigned long marker = pte_marker_get(entry); /* - * PTE markers should always be with file-backed memories, and the - * marker should never be empty. If anything weird happened, the best - * thing to do is to kill the process along with its mm. + * PTE markers should never be empty. If anything weird happened, + * the best thing to do is to kill the process along with its mm. 
*/ - if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker)) + if (WARN_ON_ONCE(!marker)) + return VM_FAULT_SIGBUS; + + /* Higher priority than uffd-wp when data corrupted */ + if (marker & PTE_MARKER_SWAPIN_ERROR) return VM_FAULT_SIGBUS; if (pte_marker_entry_uffd_wp(entry)) @@ -3763,12 +3726,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) */ get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); - vmf->page->pgmap->ops->migrate_to_ram(vmf); + ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; - } else if (is_swapin_error_entry(entry)) { - ret = VM_FAULT_SIGBUS; } else if (is_pte_marker_entry(entry)) { ret = handle_pte_marker(vmf); } else { @@ -3968,8 +3929,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (should_try_to_free_swap(folio, vma, vmf->flags)) folio_free_swap(folio); - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); /* @@ -3983,7 +3944,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; - ret |= VM_FAULT_WRITE; } rmap_flags |= RMAP_EXCLUSIVE; } @@ -4149,7 +4109,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_MISSING); } - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address); lru_cache_add_inactive_or_unevictable(page, vma); setpte: @@ -4339,11 +4299,11 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) entry = pte_mkuffd_wp(pte_wrprotect(entry)); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); 
page_add_new_anon_rmap(page, vma, addr); lru_cache_add_inactive_or_unevictable(page, vma); } else { - inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); + inc_mm_counter(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, vma, false); } set_pte_at(vma->vm_mm, addr, vmf->pte, entry); @@ -4713,10 +4673,10 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = NUMA_NO_NODE; + bool writable = false; int last_cpupid; int target_nid; pte_t pte, old_pte; - bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; /* @@ -4735,6 +4695,15 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) old_pte = ptep_get(vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); + /* + * Detect now whether the PTE could be writable; this information + * is only valid while holding the PT lock. + */ + writable = pte_write(pte); + if (!writable && vma_wants_manual_pte_write_upgrade(vma) && + can_change_pte_writable(vma, vmf->address, pte)) + writable = true; + page = vm_normal_page(vma, vmf->address, pte); if (!page || is_zone_device_page(page)) goto out_map; @@ -4751,7 +4720,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * pte_dirty has unpredictable behaviour between PTE scan updates, * background writeback, dirty balancing and application behaviour. 
*/ - if (!was_writable) + if (!writable) flags |= TNF_NO_GROUP; /* @@ -4778,6 +4747,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) goto out_map; } pte_unmap_unlock(vmf->pte, vmf->ptl); + writable = false; /* Migrate to the requested node */ if (migrate_misplaced_page(page, vma, target_nid)) { @@ -4806,7 +4776,7 @@ out_map: old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); - if (was_writable) + if (writable) pte = pte_mkwrite(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); @@ -4827,6 +4797,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; + vm_fault_t ret; if (vma_is_anonymous(vmf->vma)) { if (likely(!unshare) && @@ -4834,11 +4805,13 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf); } - if (vmf->vma->vm_ops->huge_fault) { - vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); - if (!(ret & VM_FAULT_FALLBACK)) - return ret; + if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vmf->vma->vm_ops->huge_fault) { + ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } } /* COW or write-notify handled on pte level: split pmd. 
*/ @@ -4864,14 +4837,17 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + vm_fault_t ret; + /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vmf->vma)) goto split; - if (vmf->vma->vm_ops->huge_fault) { - vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); - - if (!(ret & VM_FAULT_FALLBACK)) - return ret; + if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vmf->vma->vm_ops->huge_fault) { + ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + } } split: /* COW or write-notify not handled on PUD level: split pud.*/ @@ -5179,6 +5155,30 @@ static void lru_gen_exit_fault(void) } #endif /* CONFIG_LRU_GEN */ +static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, + unsigned int *flags) +{ + if (unlikely(*flags & FAULT_FLAG_UNSHARE)) { + if (WARN_ON_ONCE(*flags & FAULT_FLAG_WRITE)) + return VM_FAULT_SIGSEGV; + /* + * FAULT_FLAG_UNSHARE only applies to COW mappings. Let's + * just treat it like an ordinary read-fault otherwise. + */ + if (!is_cow_mapping(vma->vm_flags)) + *flags &= ~FAULT_FLAG_UNSHARE; + } else if (*flags & FAULT_FLAG_WRITE) { + /* Write faults on read-only mappings are impossible ... */ + if (WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE))) + return VM_FAULT_SIGSEGV; + /* ... and FOLL_FORCE only applies to COW mappings. */ + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE) && + !is_cow_mapping(vma->vm_flags))) + return VM_FAULT_SIGSEGV; + } + return 0; +} + /* * By the time we get here, we already hold the mm semaphore * @@ -5195,8 +5195,9 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, count_vm_event(PGFAULT); count_memcg_event_mm(vma->vm_mm, PGFAULT); - /* do counter updates before entering really critical section. 
*/ - check_sync_rss_stat(current); + ret = sanitize_fault_flags(vma, &flags); + if (ret) + return ret; if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, flags & FAULT_FLAG_INSTRUCTION, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a937eaec5b68..02c8a712282f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -787,17 +787,22 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - MA_STATE(mas, &mm->mm_mt, start - 1, start - 1); + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - prev = mas_find_rev(&mas, 0); - if (prev && (start < prev->vm_end)) - vma = prev; - else - vma = mas_next(&mas, end - 1); + prev = mas_prev(&mas, 0); + if (unlikely(!prev)) + mas_set(&mas, start); + + vma = mas_find(&mas, end - 1); + if (WARN_ON(!vma)) + return 0; + + if (start > vma->vm_start) + prev = vma; for (; vma; vma = mas_next(&mas, end - 1)) { unsigned long vmstart = max(start, vma->vm_start); @@ -1535,6 +1540,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le * the home node for vmas we already updated before. 
*/ if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + mpol_put(new); err = -EOPNOTSUPP; break; } diff --git a/mm/mempool.c b/mm/mempool.c index 96488b13a1ef..734bcf5afbb7 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -57,8 +57,10 @@ static void __check_element(mempool_t *pool, void *element, size_t size) static void check_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->free == mempool_free_slab || pool->free == mempool_kfree) { - __check_element(pool, element, ksize(element)); + if (pool->free == mempool_kfree) { + __check_element(pool, element, (size_t)pool->pool_data); + } else if (pool->free == mempool_free_slab) { + __check_element(pool, element, kmem_cache_size(pool->pool_data)); } else if (pool->free == mempool_free_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; @@ -80,8 +82,10 @@ static void __poison_element(void *element, size_t size) static void poison_element(mempool_t *pool, void *element) { /* Mempools backed by slab allocator */ - if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) { - __poison_element(element, ksize(element)); + if (pool->alloc == mempool_kmalloc) { + __poison_element(element, (size_t)pool->pool_data); + } else if (pool->alloc == mempool_alloc_slab) { + __poison_element(element, kmem_cache_size(pool->pool_data)); } else if (pool->alloc == mempool_alloc_pages) { /* Mempools backed by page allocator */ int order = (int)(long)pool->pool_data; @@ -111,8 +115,10 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) static void kasan_unpoison_element(mempool_t *pool, void *element) { - if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) - kasan_unpoison_range(element, __ksize(element)); + if (pool->alloc == mempool_kmalloc) + kasan_unpoison_range(element, (size_t)pool->pool_data); + else if (pool->alloc == mempool_alloc_slab) + kasan_unpoison_range(element, 
kmem_cache_size(pool->pool_data)); else if (pool->alloc == mempool_alloc_pages) kasan_unpoison_pages(element, (unsigned long)pool->pool_data, false); diff --git a/mm/memremap.c b/mm/memremap.c index 421bec3a29ee..08cbf54fe037 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -335,6 +335,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid) WARN(1, "File system DAX not supported\n"); return ERR_PTR(-EINVAL); } + params.pgprot = pgprot_decrypted(params.pgprot); break; case MEMORY_DEVICE_GENERIC: break; diff --git a/mm/migrate.c b/mm/migrate.c index 1379e1912772..a4d3fc65085f 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -74,13 +74,22 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode) if (unlikely(!get_page_unless_zero(page))) goto out; + if (unlikely(PageSlab(page))) + goto out_putpage; + /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */ + smp_rmb(); /* - * Check PageMovable before holding a PG_lock because page's owner - * assumes anybody doesn't touch PG_lock of newly allocated page - * so unconditionally grabbing the lock ruins page's owner side. + * Check movable flag before taking the page lock because + * we use non-atomic bitops on newly allocated page flags so + * unconditionally grabbing the lock ruins page's owner side. */ if (unlikely(!__PageMovable(page))) goto out_putpage; + /* Pairs with smp_wmb() in slab allocation, e.g. 
SLUB's alloc_slab_page() */ + smp_rmb(); + if (unlikely(PageSlab(page))) + goto out_putpage; + /* * As movable pages are not isolated from LRU lists, concurrent * compaction threads can race against page migration functions @@ -820,6 +829,7 @@ int buffer_migrate_folio_norefs(struct address_space *mapping, { return __buffer_migrate_folio(mapping, dst, src, mode, true); } +EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs); #endif int filemap_migrate_folio(struct address_space *mapping, @@ -1150,79 +1160,79 @@ out: } /* - * Obtain the lock on page, remove all ptes and migrate the page - * to the newly allocated page in newpage. + * Obtain the lock on folio, remove all ptes and migrate the folio + * to the newly allocated folio in dst. */ static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, - unsigned long private, struct page *page, + unsigned long private, struct folio *src, int force, enum migrate_mode mode, enum migrate_reason reason, struct list_head *ret) { - struct folio *dst, *src = page_folio(page); + struct folio *dst; int rc = MIGRATEPAGE_SUCCESS; struct page *newpage = NULL; - if (!thp_migration_supported() && PageTransHuge(page)) + if (!thp_migration_supported() && folio_test_transhuge(src)) return -ENOSYS; - if (page_count(page) == 1) { - /* Page was freed from under us. So we are done. */ - ClearPageActive(page); - ClearPageUnevictable(page); + if (folio_ref_count(src) == 1) { + /* Folio was freed from under us. So we are done. */ + folio_clear_active(src); + folio_clear_unevictable(src); /* free_pages_prepare() will clear PG_isolated. 
*/ goto out; } - newpage = get_new_page(page, private); + newpage = get_new_page(&src->page, private); if (!newpage) return -ENOMEM; dst = page_folio(newpage); - newpage->private = 0; + dst->private = NULL; rc = __unmap_and_move(src, dst, force, mode); if (rc == MIGRATEPAGE_SUCCESS) - set_page_owner_migrate_reason(newpage, reason); + set_page_owner_migrate_reason(&dst->page, reason); out: if (rc != -EAGAIN) { /* - * A page that has been migrated has all references - * removed and will be freed. A page that has not been + * A folio that has been migrated has all references + * removed and will be freed. A folio that has not been * migrated will have kept its references and be restored. */ - list_del(&page->lru); + list_del(&src->lru); } /* * If migration is successful, releases reference grabbed during - * isolation. Otherwise, restore the page to right list unless + * isolation. Otherwise, restore the folio to right list unless * we want to retry. */ if (rc == MIGRATEPAGE_SUCCESS) { /* - * Compaction can migrate also non-LRU pages which are + * Compaction can migrate also non-LRU folios which are * not accounted to NR_ISOLATED_*. They can be recognized - * as __PageMovable + * as __folio_test_movable */ - if (likely(!__PageMovable(page))) - mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + - page_is_file_lru(page), -thp_nr_pages(page)); + if (likely(!__folio_test_movable(src))) + mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON + + folio_is_file_lru(src), -folio_nr_pages(src)); if (reason != MR_MEMORY_FAILURE) /* - * We release the page in page_handle_poison. + * We release the folio in page_handle_poison. 
*/ - put_page(page); + folio_put(src); } else { if (rc != -EAGAIN) - list_add_tail(&page->lru, ret); + list_add_tail(&src->lru, ret); if (put_new_page) - put_new_page(newpage, private); + put_new_page(&dst->page, private); else - put_page(newpage); + folio_put(dst); } return rc; @@ -1298,7 +1308,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ - if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) { + if (hugetlb_folio_subpool(src) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } @@ -1348,7 +1358,7 @@ put_anon: put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { - move_hugetlb_state(hpage, new_hpage, reason); + move_hugetlb_state(src, dst, reason); put_new_page = NULL; } @@ -1373,214 +1383,248 @@ out: return rc; } -static inline int try_split_thp(struct page *page, struct list_head *split_pages) +static inline int try_split_folio(struct folio *folio, struct list_head *split_folios) { int rc; - lock_page(page); - rc = split_huge_page_to_list(page, split_pages); - unlock_page(page); + folio_lock(folio); + rc = split_folio_to_list(folio, split_folios); + folio_unlock(folio); if (!rc) - list_move_tail(&page->lru, split_pages); + list_move_tail(&folio->lru, split_folios); return rc; } /* - * migrate_pages - migrate the pages specified in a list, to the free pages + * migrate_pages - migrate the folios specified in a list, to the free folios * supplied as the target for the page migration * - * @from: The list of pages to be migrated. - * @get_new_page: The function used to allocate free pages to be used - * as the target of the page migration. - * @put_new_page: The function used to free target pages if migration + * @from: The list of folios to be migrated. + * @get_new_page: The function used to allocate free folios to be used + * as the target of the folio migration. 
+ * @put_new_page: The function used to free target folios if migration * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for - * page migration, if any. - * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of normal pages migrated successfully if + * folio migration, if any. + * @reason: The reason for folio migration. + * @ret_succeeded: Set to the number of folios migrated successfully if * the caller passes a non-NULL pointer. * - * The function returns after 10 attempts or if no pages are movable any more - * because the list has become empty or no retryable pages exist any more. - * It is caller's responsibility to call putback_movable_pages() to return pages + * The function returns after 10 attempts or if no folios are movable any more + * because the list has become empty or no retryable folios exist any more. + * It is caller's responsibility to call putback_movable_pages() to return folios * to the LRU or free list only if ret != 0. * - * Returns the number of {normal page, THP, hugetlb} that were not migrated, or - * an error code. The number of THP splits will be considered as the number of - * non-migrated THP, no matter how many subpages of the THP are migrated successfully. + * Returns the number of {normal folio, large folio, hugetlb} that were not + * migrated, or an error code. The number of large folio splits will be + * considered as the number of non-migrated large folio, no matter how many + * split folios of the large folio are migrated successfully. 
*/ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, enum migrate_mode mode, int reason, unsigned int *ret_succeeded) { int retry = 1; + int large_retry = 1; int thp_retry = 1; int nr_failed = 0; int nr_failed_pages = 0; int nr_retry_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; + int nr_large_failed = 0; int nr_thp_failed = 0; int nr_thp_split = 0; int pass = 0; + bool is_large = false; bool is_thp = false; - struct page *page; - struct page *page2; - int rc, nr_subpages; - LIST_HEAD(ret_pages); - LIST_HEAD(thp_split_pages); + struct folio *folio, *folio2; + int rc, nr_pages; + LIST_HEAD(ret_folios); + LIST_HEAD(split_folios); bool nosplit = (reason == MR_NUMA_MISPLACED); - bool no_subpage_counting = false; + bool no_split_folio_counting = false; trace_mm_migrate_pages_start(mode, reason); -thp_subpage_migration: - for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { +split_folio_migration: + for (pass = 0; pass < 10 && (retry || large_retry); pass++) { retry = 0; + large_retry = 0; thp_retry = 0; nr_retry_pages = 0; - list_for_each_entry_safe(page, page2, from, lru) { + list_for_each_entry_safe(folio, folio2, from, lru) { /* - * THP statistics is based on the source huge page. - * Capture required information that might get lost - * during migration. + * Large folio statistics is based on the source large + * folio. Capture required information that might get + * lost during migration. 
*/ - is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = compound_nr(page); + is_large = folio_test_large(folio) && !folio_test_hugetlb(folio); + is_thp = is_large && folio_test_pmd_mappable(folio); + nr_pages = folio_nr_pages(folio); cond_resched(); - if (PageHuge(page)) + if (folio_test_hugetlb(folio)) rc = unmap_and_move_huge_page(get_new_page, - put_new_page, private, page, - pass > 2, mode, reason, - &ret_pages); + put_new_page, private, + &folio->page, pass > 2, mode, + reason, + &ret_folios); else rc = unmap_and_move(get_new_page, put_new_page, - private, page, pass > 2, mode, - reason, &ret_pages); + private, folio, pass > 2, mode, + reason, &ret_folios); /* * The rules are: - * Success: non hugetlb page will be freed, hugetlb - * page will be put back + * Success: non hugetlb folio will be freed, hugetlb + * folio will be put back * -EAGAIN: stay on the from list * -ENOMEM: stay on the from list * -ENOSYS: stay on the from list - * Other errno: put on ret_pages list then splice to + * Other errno: put on ret_folios list then splice to * from list */ switch(rc) { /* - * THP migration might be unsupported or the - * allocation could've failed so we should - * retry on the same page with the THP split - * to base pages. + * Large folio migration might be unsupported or + * the allocation could've failed so we should retry + * on the same folio with the large folio split + * to normal folios. * - * Sub-pages are put in thp_split_pages, and + * Split folios are put in split_folios, and * we will migrate them after the rest of the * list is processed. 
*/ case -ENOSYS: - /* THP migration is unsupported */ - if (is_thp) { - nr_thp_failed++; - if (!try_split_thp(page, &thp_split_pages)) { - nr_thp_split++; + /* Large folio migration is unsupported */ + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + if (!try_split_folio(folio, &split_folios)) { + nr_thp_split += is_thp; break; } /* Hugetlb migration is unsupported */ - } else if (!no_subpage_counting) { + } else if (!no_split_folio_counting) { nr_failed++; } - nr_failed_pages += nr_subpages; - list_move_tail(&page->lru, &ret_pages); + nr_failed_pages += nr_pages; + list_move_tail(&folio->lru, &ret_folios); break; case -ENOMEM: /* * When memory is low, don't bother to try to migrate - * other pages, just exit. + * other folios, just exit. */ - if (is_thp) { - nr_thp_failed++; - /* THP NUMA faulting doesn't split THP to retry. */ - if (!nosplit && !try_split_thp(page, &thp_split_pages)) { - nr_thp_split++; - break; + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + /* Large folio NUMA faulting doesn't split to retry. */ + if (!nosplit) { + int ret = try_split_folio(folio, &split_folios); + + if (!ret) { + nr_thp_split += is_thp; + break; + } else if (reason == MR_LONGTERM_PIN && + ret == -EAGAIN) { + /* + * Try again to split large folio to + * mitigate the failure of longterm pinning. + */ + large_retry++; + thp_retry += is_thp; + nr_retry_pages += nr_pages; + break; + } } - } else if (!no_subpage_counting) { + } else if (!no_split_folio_counting) { nr_failed++; } - nr_failed_pages += nr_subpages + nr_retry_pages; + nr_failed_pages += nr_pages + nr_retry_pages; /* - * There might be some subpages of fail-to-migrate THPs - * left in thp_split_pages list. Move them back to migration + * There might be some split folios of fail-to-migrate large + * folios left in split_folios list. Move them back to migration * list so that they could be put back to the right list by - * the caller otherwise the page refcnt will be leaked. 
+ * the caller otherwise the folio refcnt will be leaked. */ - list_splice_init(&thp_split_pages, from); + list_splice_init(&split_folios, from); /* nr_failed isn't updated for not used */ + nr_large_failed += large_retry; nr_thp_failed += thp_retry; goto out; case -EAGAIN: - if (is_thp) - thp_retry++; - else if (!no_subpage_counting) + if (is_large) { + large_retry++; + thp_retry += is_thp; + } else if (!no_split_folio_counting) { retry++; - nr_retry_pages += nr_subpages; + } + nr_retry_pages += nr_pages; break; case MIGRATEPAGE_SUCCESS: - nr_succeeded += nr_subpages; - if (is_thp) - nr_thp_succeeded++; + nr_succeeded += nr_pages; + nr_thp_succeeded += is_thp; break; default: /* * Permanent failure (-EBUSY, etc.): - * unlike -EAGAIN case, the failed page is - * removed from migration page list and not + * unlike -EAGAIN case, the failed folio is + * removed from migration folio list and not * retried in the next outer loop. */ - if (is_thp) - nr_thp_failed++; - else if (!no_subpage_counting) + if (is_large) { + nr_large_failed++; + nr_thp_failed += is_thp; + } else if (!no_split_folio_counting) { nr_failed++; + } - nr_failed_pages += nr_subpages; + nr_failed_pages += nr_pages; break; } } } nr_failed += retry; + nr_large_failed += large_retry; nr_thp_failed += thp_retry; nr_failed_pages += nr_retry_pages; /* - * Try to migrate subpages of fail-to-migrate THPs, no nr_failed - * counting in this round, since all subpages of a THP is counted - * as 1 failure in the first round. + * Try to migrate split folios of fail-to-migrate large folios, no + * nr_failed counting in this round, since all split folios of a + * large folio is counted as 1 failure in the first round. */ - if (!list_empty(&thp_split_pages)) { + if (!list_empty(&split_folios)) { /* - * Move non-migrated pages (after 10 retries) to ret_pages + * Move non-migrated folios (after 10 retries) to ret_folios * to avoid migrating them again. 
*/ - list_splice_init(from, &ret_pages); - list_splice_init(&thp_split_pages, from); - no_subpage_counting = true; + list_splice_init(from, &ret_folios); + list_splice_init(&split_folios, from); + no_split_folio_counting = true; retry = 1; - goto thp_subpage_migration; + goto split_folio_migration; } - rc = nr_failed + nr_thp_failed; + rc = nr_failed + nr_large_failed; out: /* - * Put the permanent failure page back to migration list, they + * Put the permanent failure folio back to migration list, they * will be put back to the right list by the caller. */ - list_splice(&ret_pages, from); + list_splice(&ret_folios, from); + + /* + * Return 0 in case all split folios of fail-to-migrate large folios + * are migrated successfully. + */ + if (list_empty(from)) + rc = 0; count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); @@ -1613,7 +1657,7 @@ struct page *alloc_migration_target(struct page *page, unsigned long private) nid = folio_nid(folio); if (folio_test_hugetlb(folio)) { - struct hstate *h = page_hstate(&folio->page); + struct hstate *h = folio_hstate(folio); gfp_mask = htlb_modify_alloc_mask(h, gfp_mask); return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask); @@ -1879,7 +1923,6 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, for (i = 0; i < nr_pages; i++) { unsigned long addr = (unsigned long)(*pages); - unsigned int foll_flags = FOLL_DUMP; struct vm_area_struct *vma; struct page *page; int err = -EFAULT; @@ -1888,12 +1931,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!vma) goto set_status; - /* Not all huge page follow APIs support 'FOLL_GET' */ - if (!is_vm_hugetlb_page(vma)) - foll_flags |= FOLL_GET; - /* FOLL_DUMP to ignore special (like zero) pages */ - page = follow_page(vma, addr, foll_flags); + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); err = PTR_ERR(page); if (IS_ERR(page)) @@ -1906,8 +1945,7 @@ static void 
do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (!is_zone_device_page(page)) err = page_to_nid(page); - if (foll_flags & FOLL_GET) - put_page(page); + put_page(page); set_status: *status = err; diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 6fa682eef7a0..721b2365dbca 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -357,7 +357,8 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) } /* - * Unmaps pages for migration. Returns number of unmapped pages. + * Unmaps pages for migration. Returns number of source pfns marked as + * migrating. */ static unsigned long migrate_device_unmap(unsigned long *src_pfns, unsigned long npages, @@ -373,8 +374,11 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns, struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; - if (!page) + if (!page) { + if (src_pfns[i] & MIGRATE_PFN_MIGRATE) + unmapped++; continue; + } /* ZONE_DEVICE pages are not on LRU */ if (!is_zone_device_page(page)) { diff --git a/mm/mincore.c b/mm/mincore.c index fa200c14185f..a085a2aeabd8 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -52,7 +52,7 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) { unsigned char present = 0; - struct page *page; + struct folio *folio; /* * When tmpfs swaps out a page from a file, any process mapping that @@ -60,10 +60,10 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t index) * any other file mapping (ie. marked !present and faulted in with * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 
*/ - page = find_get_incore_page(mapping, index); - if (page) { - present = PageUptodate(page); - put_page(page); + folio = filemap_get_incore_folio(mapping, index); + if (folio) { + present = folio_test_uptodate(folio); + folio_put(folio); } return present; @@ -190,8 +190,8 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v unsigned long end; int err; - vma = find_vma(current->mm, addr); - if (!vma || addr < vma->vm_start) + vma = vma_lookup(current->mm, addr); + if (!vma) return -ENOMEM; end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); if (!can_do_mincore(vma)) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 0d7b2bd2454a..c1883362e71d 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -178,16 +178,10 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, return NOTIFY_OK; } -static struct notifier_block compute_batch_nb __meminitdata = { - .notifier_call = mm_compute_batch_notifier, - .priority = IPC_CALLBACK_PRI, /* use lowest priority */ -}; - static int __init mm_compute_batch_init(void) { mm_compute_batch(sysctl_overcommit_memory); - register_hotmemory_notifier(&compute_batch_nb); - + hotplug_memory_notifier(mm_compute_batch_notifier, MM_COMPUTE_BATCH_PRI); return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index bf2122af94e7..87d929316d57 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -226,8 +226,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* Search one past newbrk */ mas_set(&mas, newbrk); brkvma = mas_find(&mas, oldbrk); - BUG_ON(brkvma == NULL); - if (brkvma->vm_start >= oldbrk) + if (!brkvma || brkvma->vm_start >= oldbrk) goto out; /* mapping intersects with an existing non-brk vma. */ /* * mm->brk must be protected by write mmap_lock. @@ -456,7 +455,7 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) * vma_mas_szero() - Set a given range to zero. Used when modifying a * vm_area_struct start or end. 
* - * @mm: The struct_mm + * @mas: The maple tree ma_state * @start: The start address to zero * @end: The end address to zero. */ @@ -618,7 +617,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *next_next = NULL; /* uninit var warning */ + struct vm_area_struct *next = find_vma(mm, vma->vm_end); struct vm_area_struct *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; @@ -1778,9 +1778,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, */ pgoff = 0; get_area = shmem_get_unmapped_area; - } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { - /* Ensures that larger anonymous mappings are THP aligned. */ - get_area = thp_get_unmapped_area; } addr = get_area(file, addr, len, pgoff, flags); @@ -2625,14 +2622,14 @@ cannot_expand: if (error) goto unmap_and_free_vma; - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM + /* + * Expansion is handled above, merging is handled below. + * Drivers should not alter the address of the VMA. */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; + if (WARN_ON((addr != vma->vm_start))) { + error = -EINVAL; + goto close_and_free_vma; + } mas_reset(&mas); /* @@ -2654,7 +2651,6 @@ cannot_expand: vm_area_free(vma); vma = merge; /* Update vm_flags to pick up the change. 
*/ - addr = vma->vm_start; vm_flags = vma->vm_flags; goto unmap_writable; } @@ -2674,6 +2670,8 @@ cannot_expand: error = -EINVAL; if (file) goto close_and_free_vma; + else if (vma->vm_file) + goto unmap_and_free_vma; else goto free_vma; } @@ -2681,6 +2679,8 @@ cannot_expand: if (mas_preallocate(&mas, vma, GFP_KERNEL)) { error = -ENOMEM; if (file) + goto close_and_free_vma; + else if (vma->vm_file) goto unmap_and_free_vma; else goto free_vma; @@ -2751,7 +2751,7 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. */ unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); - if (vm_flags & VM_SHARED) + if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); free_vma: vm_area_free(vma); @@ -2852,6 +2852,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; + if (start + size <= next->vm_end) + break; + prev = next; } @@ -2942,12 +2945,12 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. 
*/ - if (vma && - (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && - ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { + if (vma && vma->vm_end == addr && !vma_policy(vma) && + can_vma_merge_after(vma, flags, NULL, NULL, + addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { mas_set_range(mas, vma->vm_start, addr + len - 1); if (mas_preallocate(mas, vma, GFP_KERNEL)) - return -ENOMEM; + goto unacct_fail; vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); if (vma->anon_vma) { @@ -2969,7 +2972,7 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) - goto vma_alloc_fail; + goto unacct_fail; vma_set_anonymous(vma); vma->vm_start = addr; @@ -2994,7 +2997,7 @@ out: mas_store_fail: vm_area_free(vma); -vma_alloc_fail: +unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } @@ -3031,11 +3034,6 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) goto munmap_failed; vma = mas_prev(&mas, 0); - if (!vma || vma->vm_end != addr || vma_policy(vma) || - !can_vma_merge_after(vma, flags, NULL, NULL, - addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) - vma = NULL; - ret = do_brk_flags(&mas, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); @@ -3746,13 +3744,9 @@ static int reserve_mem_notifier(struct notifier_block *nb, return NOTIFY_OK; } -static struct notifier_block reserve_mem_nb = { - .notifier_call = reserve_mem_notifier, -}; - static int __meminit init_reserve_notifier(void) { - if (register_hotmemory_notifier(&reserve_mem_nb)) + if (hotplug_memory_notifier(reserve_mem_notifier, DEFAULT_CALLBACK_PRI)) pr_err("Failed registering memory add/remove notifier for admin reserve\n"); return 0; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index add4244e5790..2b93cf6ac9ae 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -1,7 +1,6 @@ #include <linux/gfp.h> #include 
<linux/highmem.h> #include <linux/kernel.h> -#include <linux/kmsan-checks.h> #include <linux/mmdebug.h> #include <linux/mm_types.h> #include <linux/mm_inline.h> @@ -9,6 +8,7 @@ #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> +#include <linux/rmap.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -19,6 +19,10 @@ static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; + /* Limit batching if we have delayed rmaps pending */ + if (tlb->delayed_rmap && tlb->active != &tlb->local) + return false; + batch = tlb->active; if (batch->next) { tlb->active = batch->next; @@ -43,12 +47,46 @@ static bool tlb_next_batch(struct mmu_gather *tlb) return true; } +#ifdef CONFIG_SMP +static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma) +{ + for (int i = 0; i < batch->nr; i++) { + struct encoded_page *enc = batch->encoded_pages[i]; + + if (encoded_page_flags(enc)) { + struct page *page = encoded_page_ptr(enc); + page_remove_rmap(page, vma, false); + } + } +} + +/** + * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB + * @tlb: the current mmu_gather + * + * Note that because of how tlb_next_batch() above works, we will + * never start multiple new batches with pending delayed rmaps, so + * we only need to walk through the current active batch and the + * original local one. 
+ */ +void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) +{ + if (!tlb->delayed_rmap) + return; + + tlb_flush_rmap_batch(&tlb->local, vma); + if (tlb->active != &tlb->local) + tlb_flush_rmap_batch(tlb->active, vma); + tlb->delayed_rmap = 0; +} +#endif + static void tlb_batch_pages_flush(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { - struct page **pages = batch->pages; + struct encoded_page **pages = batch->encoded_pages; do { /* @@ -77,7 +115,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) tlb->local.next = NULL; } -bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) +bool __tlb_remove_page_size(struct mmu_gather *tlb, struct encoded_page *page, int page_size) { struct mmu_gather_batch *batch; @@ -92,13 +130,13 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ * Add the page and check if we are full. If so * force a flush. 
*/ - batch->pages[batch->nr++] = page; + batch->encoded_pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } - VM_BUG_ON_PAGE(batch->nr > batch->max, page); + VM_BUG_ON_PAGE(batch->nr > batch->max, encoded_page_ptr(page)); return false; } @@ -153,7 +191,7 @@ static void tlb_remove_table_smp_sync(void *arg) /* Simply deliver the interrupt */ } -static void tlb_remove_table_sync_one(void) +void tlb_remove_table_sync_one(void) { /* * This isn't an RCU grace period and hence the page-tables cannot be @@ -177,8 +215,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch) #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */ -static void tlb_remove_table_sync_one(void) { } - static void tlb_remove_table_free(struct mmu_table_batch *batch) { __tlb_remove_table_free(batch); @@ -266,15 +302,6 @@ void tlb_flush_mmu(struct mmu_gather *tlb) static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { - /* - * struct mmu_gather contains 7 1-bit fields packed into a 32-bit - * unsigned int value. The remaining 25 bits remain uninitialized - * and are never used, but KMSAN updates the origin for them in - * zap_pXX_range() in mm/memory.c, thus creating very long origin - * chains. This is technically correct, but consumes too much memory. - * Unpoisoning the whole structure will prevent creating such chains. 
- */ - kmsan_unpoison_memory(tlb, sizeof(*tlb)); tlb->mm = mm; tlb->fullmm = fullmm; @@ -286,6 +313,7 @@ static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, tlb->active = &tlb->local; tlb->batch_count = 0; #endif + tlb->delayed_rmap = 0; tlb_table_init(tlb); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE diff --git a/mm/mprotect.c b/mm/mprotect.c index 668bfaa6ed2a..908df12caa26 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -39,14 +39,16 @@ #include "internal.h" -static inline bool can_change_pte_writable(struct vm_area_struct *vma, - unsigned long addr, pte_t pte) +bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr, + pte_t pte) { struct page *page; - VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte)); + if (WARN_ON_ONCE(!(vma->vm_flags & VM_WRITE))) + return false; - if (pte_protnone(pte) || !pte_dirty(pte)) + /* Don't touch entries that are not even readable. */ + if (pte_protnone(pte)) return false; /* Do we need write faults for softdirty tracking? */ @@ -59,17 +61,23 @@ static inline bool can_change_pte_writable(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_SHARED)) { /* - * We can only special-case on exclusive anonymous pages, - * because we know that our write-fault handler similarly would - * map them writable without any additional checks while holding - * the PT lock. + * Writable MAP_PRIVATE mapping: We can only special-case on + * exclusive anonymous pages, because we know that our + * write-fault handler similarly would map them writable without + * any additional checks while holding the PT lock. */ page = vm_normal_page(vma, addr, pte); - if (!page || !PageAnon(page) || !PageAnonExclusive(page)) - return false; + return page && PageAnon(page) && PageAnonExclusive(page); } - return true; + /* + * Writable MAP_SHARED mapping: "clean" might indicate that the FS still + * needs a real write-fault for writenotify + * (see vma_wants_writenotify()). 
If "dirty", the assumption is that the + * FS was already notified and we can simply mark the PTE writable + * just like the write-fault handler would do. + */ + return pte_dirty(pte); } static unsigned long change_pte_range(struct mmu_gather *tlb, @@ -113,7 +121,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = *pte; if (pte_present(oldpte)) { pte_t ptent; - bool preserve_write = prot_numa && pte_write(oldpte); /* * Avoid trapping faults against the zero or KSM @@ -169,8 +176,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, oldpte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_modify(oldpte, newprot); - if (preserve_write) - ptent = pte_mk_savedwrite(ptent); if (uffd_wp) { ptent = pte_wrprotect(ptent); @@ -267,7 +272,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, } else { /* It must be an none page, or what else?.. */ WARN_ON_ONCE(!pte_none(oldpte)); -#ifdef CONFIG_PTE_MARKER_UFFD_WP if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { /* * For file-backed mem, we need to be able to @@ -279,7 +283,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, make_pte_marker(PTE_MARKER_UFFD_WP)); pages++; } -#endif } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -294,7 +297,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, */ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) { - pmd_t pmdval = pmd_read_atomic(pmd); + pmd_t pmdval = pmdp_get_lockless(pmd); /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -552,8 +555,8 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long oldflags = vma->vm_flags; long nrpages = (end - start) >> PAGE_SHIFT; + unsigned int mm_cp_flags = 0; unsigned long charged = 0; - bool try_change_writable; pgoff_t pgoff; int error; @@ -631,20 +634,11 @@ success: * held in write mode. 
*/ vma->vm_flags = newflags; - /* - * We want to check manually if we can change individual PTEs writable - * if we can't do that automatically for all PTEs in a mapping. For - * private mappings, that's always the case when we have write - * permissions as we properly have to handle COW. - */ - if (vma->vm_flags & VM_SHARED) - try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot); - else - try_change_writable = !!(vma->vm_flags & VM_WRITE); + if (vma_wants_manual_pte_write_upgrade(vma)) + mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; vma_set_page_prot(vma); - change_protection(tlb, vma, start, end, vma->vm_page_prot, - try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0); + change_protection(tlb, vma, start, end, vma->vm_page_prot, mm_cp_flags); /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major @@ -756,8 +750,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, * If a permission is not passed to mprotect(), it must be * cleared from the VMA. 
*/ - mask_off_old_flags = VM_READ | VM_WRITE | VM_EXEC | - VM_FLAGS_CLEAR; + mask_off_old_flags = VM_ACCESS_FLAGS | VM_FLAGS_CLEAR; new_vma_pkey = arch_override_mprotect_pkey(vma, prot, pkey); newflags = calc_vm_prot_bits(prot, new_vma_pkey); diff --git a/mm/mremap.c b/mm/mremap.c index e465ffe279bb..fe587c5d6591 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -1016,7 +1016,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, long pages = (new_len - old_len) >> PAGE_SHIFT; unsigned long extension_start = addr + old_len; unsigned long extension_end = addr + new_len; - pgoff_t extension_pgoff = vma->vm_pgoff + (old_len >> PAGE_SHIFT); + pgoff_t extension_pgoff = vma->vm_pgoff + + ((extension_start - vma->vm_start) >> PAGE_SHIFT); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7e9d8d857ecc..ad608ef2a243 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -13,6 +13,7 @@ */ #include <linux/kernel.h> +#include <linux/math64.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/fs.h> @@ -197,7 +198,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, min *= this_bw; min = div64_ul(min, tot_bw); } - if (max < 100) { + if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } @@ -650,11 +651,49 @@ void wb_domain_exit(struct wb_domain *dom) */ static unsigned int bdi_min_ratio; -int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +static int bdi_check_pages_limit(unsigned long pages) +{ + unsigned long max_dirty_pages = global_dirtyable_memory(); + + if (pages > max_dirty_pages) + return -EINVAL; + + return 0; +} + +static unsigned long bdi_ratio_from_pages(unsigned long pages) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long ratio; + + global_dirty_limits(&background_thresh, &dirty_thresh); + ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, 
dirty_thresh); + + return ratio; +} + +static u64 bdi_get_bytes(unsigned int ratio) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + u64 bytes; + + global_dirty_limits(&background_thresh, &dirty_thresh); + bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; + + return bytes; +} + +static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { unsigned int delta; int ret = 0; + if (min_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; + min_ratio *= BDI_RATIO_SCALE; + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; @@ -665,7 +704,7 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) bdi->min_ratio = min_ratio; } else { delta = min_ratio - bdi->min_ratio; - if (bdi_min_ratio + delta < 100) { + if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { bdi_min_ratio += delta; bdi->min_ratio = min_ratio; } else { @@ -678,11 +717,11 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) return ret; } -int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; - if (max_ratio > 100) + if (max_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); @@ -696,8 +735,81 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) return ret; } + +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio); +} + +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio * 
BDI_RATIO_SCALE); +} EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_min_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->min_ratio); +} + +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) +{ + int ret; + unsigned long pages = min_bytes >> PAGE_SHIFT; + unsigned long min_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + min_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_min_ratio(bdi, min_ratio); +} + +u64 bdi_get_max_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->max_ratio); +} + +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) +{ + int ret; + unsigned long pages = max_bytes >> PAGE_SHIFT; + unsigned long max_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + max_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { @@ -760,15 +872,15 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); - wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); - wb_thresh += (thresh * wb_min_ratio) / 100; - if (wb_thresh > (thresh * wb_max_ratio) / 100) - wb_thresh = thresh * wb_max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + if (wb_thresh > (thresh * 
wb_max_ratio) / (100 * BDI_RATIO_SCALE)) + wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); return wb_thresh; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e20ade858e71..0745aedebb37 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -170,21 +170,12 @@ static DEFINE_MUTEX(pcp_batch_high_lock); _ret; \ }) -#define pcpu_spin_lock_irqsave(type, member, ptr, flags) \ +#define pcpu_spin_trylock(type, member, ptr) \ ({ \ type *_ret; \ pcpu_task_pin(); \ _ret = this_cpu_ptr(ptr); \ - spin_lock_irqsave(&_ret->member, flags); \ - _ret; \ -}) - -#define pcpu_spin_trylock_irqsave(type, member, ptr, flags) \ -({ \ - type *_ret; \ - pcpu_task_pin(); \ - _ret = this_cpu_ptr(ptr); \ - if (!spin_trylock_irqsave(&_ret->member, flags)) { \ + if (!spin_trylock(&_ret->member)) { \ pcpu_task_unpin(); \ _ret = NULL; \ } \ @@ -197,27 +188,16 @@ static DEFINE_MUTEX(pcp_batch_high_lock); pcpu_task_unpin(); \ }) -#define pcpu_spin_unlock_irqrestore(member, ptr, flags) \ -({ \ - spin_unlock_irqrestore(&ptr->member, flags); \ - pcpu_task_unpin(); \ -}) - /* struct per_cpu_pages specific helpers. 
*/ #define pcp_spin_lock(ptr) \ pcpu_spin_lock(struct per_cpu_pages, lock, ptr) -#define pcp_spin_lock_irqsave(ptr, flags) \ - pcpu_spin_lock_irqsave(struct per_cpu_pages, lock, ptr, flags) - -#define pcp_spin_trylock_irqsave(ptr, flags) \ - pcpu_spin_trylock_irqsave(struct per_cpu_pages, lock, ptr, flags) +#define pcp_spin_trylock(ptr) \ + pcpu_spin_trylock(struct per_cpu_pages, lock, ptr) #define pcp_spin_unlock(ptr) \ pcpu_spin_unlock(lock, ptr) -#define pcp_spin_unlock_irqrestore(ptr, flags) \ - pcpu_spin_unlock_irqrestore(lock, ptr, flags) #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -798,6 +778,7 @@ static void prep_compound_head(struct page *page, unsigned int order) set_compound_page_dtor(page, COMPOUND_PAGE_DTOR); set_compound_order(page, order); atomic_set(compound_mapcount_ptr(page), -1); + atomic_set(subpages_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); } @@ -807,6 +788,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) @@ -1323,11 +1305,19 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping may be compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { + /* the first tail page: these may be in place of ->mapping */ + if (unlikely(head_compound_mapcount(head_page))) { bad_page(page, "nonzero compound_mapcount"); goto out; } + if (unlikely(atomic_read(subpages_mapcount_ptr(head_page)))) { + bad_page(page, "nonzero subpages_mapcount"); + goto out; + } + if (unlikely(head_compound_pincount(head_page))) { + bad_page(page, "nonzero compound_pincount"); + goto out; + } break; case 2: /* @@ -1430,10 +1420,8 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && 
compound_order(page) != order, page); - if (compound) { - ClearPageDoubleMap(page); + if (compound) ClearPageHasHWPoisoned(page); - } for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); @@ -1546,6 +1534,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp, int pindex) { + unsigned long flags; int min_pindex = 0; int max_pindex = NR_PCP_LISTS - 1; unsigned int order; @@ -1561,8 +1550,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Ensure requested pindex is drained first. */ pindex = pindex - 1; - /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. */ - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); isolated_pageblocks = has_isolate_pageblock(zone); while (count > 0) { @@ -1610,7 +1598,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, } while (count > 0 && !list_empty(list)); } - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); } static void free_one_page(struct zone *zone, @@ -1714,6 +1702,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, if (!free_pages_prepare(page, order, true, fpi_flags)) return; + /* + * Calling get_pfnblock_migratetype() without spin_lock_irqsave() here + * is used to avoid calling get_pfnblock_migratetype() under the lock. + * This will reduce the lock holding time. + */ migratetype = get_pfnblock_migratetype(page, pfn); spin_lock_irqsave(&zone->lock, flags); @@ -3124,10 +3117,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, unsigned int alloc_flags) { + unsigned long flags; int i, allocated = 0; - /* Caller must hold IRQ-safe pcp->lock so IRQs are disabled. 
*/ - spin_lock(&zone->lock); + spin_lock_irqsave(&zone->lock, flags); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype, alloc_flags); @@ -3161,7 +3154,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock(&zone->lock); + spin_unlock_irqrestore(&zone->lock, flags); return allocated; } @@ -3178,16 +3171,9 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) { - unsigned long flags; - - /* - * free_pcppages_bulk expects IRQs disabled for zone->lock - * so even though pcp->lock is not intended to be IRQ-safe, - * it's needed in this context. - */ - spin_lock_irqsave(&pcp->lock, flags); + spin_lock(&pcp->lock); free_pcppages_bulk(zone, to_drain, pcp, 0); - spin_unlock_irqrestore(&pcp->lock, flags); + spin_unlock(&pcp->lock); } } #endif @@ -3201,12 +3187,9 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); if (pcp->count) { - unsigned long flags; - - /* See drain_zone_pages on why this is disabling IRQs */ - spin_lock_irqsave(&pcp->lock, flags); + spin_lock(&pcp->lock); free_pcppages_bulk(zone, pcp->count, pcp, 0); - spin_unlock_irqrestore(&pcp->lock, flags); + spin_unlock(&pcp->lock); } } @@ -3472,7 +3455,6 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, */ void free_unref_page(struct page *page, unsigned int order) { - unsigned long flags; unsigned long __maybe_unused UP_flags; struct per_cpu_pages *pcp; struct zone *zone; @@ -3500,10 +3482,10 @@ void free_unref_page(struct page *page, unsigned int order) zone = page_zone(page); pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (pcp) { free_unref_page_commit(zone, pcp, 
page, migratetype, order); - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); } else { free_one_page(zone, page, pfn, order, migratetype, FPI_NONE); } @@ -3515,10 +3497,10 @@ void free_unref_page(struct page *page, unsigned int order) */ void free_unref_page_list(struct list_head *list) { + unsigned long __maybe_unused UP_flags; struct page *page, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - unsigned long flags; int batch_count = 0; int migratetype; @@ -3545,39 +3527,54 @@ void free_unref_page_list(struct list_head *list) list_for_each_entry_safe(page, next, list, lru) { struct zone *zone = page_zone(page); - /* Different zone, different pcp lock. */ - if (zone != locked_zone) { - if (pcp) - pcp_spin_unlock_irqrestore(pcp, flags); + list_del(&page->lru); + migratetype = get_pcppage_migratetype(page); + + /* + * Either different zone requiring a different pcp lock or + * excessive lock hold times when freeing a large list of + * pages. + */ + if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + } + batch_count = 0; + + /* + * trylock is necessary as pages may be getting freed + * from IRQ or SoftIRQ context after an IO completion. + */ + pcp_trylock_prepare(UP_flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); + if (unlikely(!pcp)) { + pcp_trylock_finish(UP_flags); + free_one_page(zone, page, page_to_pfn(page), + 0, migratetype, FPI_NONE); + locked_zone = NULL; + continue; + } locked_zone = zone; - pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); } /* * Non-isolated types over MIGRATE_PCPTYPES get added * to the MIGRATE_MOVABLE pcp list. 
*/ - migratetype = get_pcppage_migratetype(page); if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(page); free_unref_page_commit(zone, pcp, page, migratetype, 0); - - /* - * Guard against excessive IRQ disabled times when we get - * a large list of pages to free. - */ - if (++batch_count == SWAP_CLUSTER_MAX) { - pcp_spin_unlock_irqrestore(pcp, flags); - batch_count = 0; - pcp = pcp_spin_lock_irqsave(locked_zone->per_cpu_pageset, flags); - } + batch_count++; } - if (pcp) - pcp_spin_unlock_irqrestore(pcp, flags); + if (pcp) { + pcp_spin_unlock(pcp); + pcp_trylock_finish(UP_flags); + } } /* @@ -3778,15 +3775,11 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct per_cpu_pages *pcp; struct list_head *list; struct page *page; - unsigned long flags; unsigned long __maybe_unused UP_flags; - /* - * spin_trylock may fail due to a parallel drain. In the future, the - * trylock will also protect against IRQ reentrancy. - */ + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. 
*/ pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) { pcp_trylock_finish(UP_flags); return NULL; @@ -3800,7 +3793,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp->free_factor >>= 1; list = &pcp->lists[order_to_pindex(migratetype, order)]; page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); @@ -3886,6 +3879,8 @@ __setup("fail_page_alloc=", setup_fail_page_alloc); static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { + int flags = 0; + if (order < fail_page_alloc.min_order) return false; if (gfp_mask & __GFP_NOFAIL) @@ -3896,10 +3891,11 @@ static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) (gfp_mask & __GFP_DIRECT_RECLAIM)) return false; + /* See comment in __should_failslab() */ if (gfp_mask & __GFP_NOWARN) - fail_page_alloc.attr.no_warn = true; + flags |= FAULT_NOWARN; - return should_fail(&fail_page_alloc.attr, 1 << order); + return should_fail_ex(&fail_page_alloc.attr, 1 << order, flags); } #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS @@ -5368,7 +5364,6 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, struct page **page_array) { struct page *page; - unsigned long flags; unsigned long __maybe_unused UP_flags; struct zone *zone; struct zoneref *z; @@ -5450,9 +5445,9 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!zone)) goto failed; - /* Is a parallel drain in progress? */ + /* spin_trylock may fail due to a parallel drain or IRQ reentrancy. 
*/ pcp_trylock_prepare(UP_flags); - pcp = pcp_spin_trylock_irqsave(zone->per_cpu_pageset, flags); + pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (!pcp) goto failed_irq; @@ -5471,7 +5466,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, if (unlikely(!page)) { /* Try and allocate at least one page */ if (!nr_account) { - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); goto failed_irq; } break; @@ -5486,7 +5481,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, nr_populated++; } - pcp_spin_unlock_irqrestore(pcp, flags); + pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); __count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account); @@ -5784,14 +5779,18 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) { if (addr) { - unsigned long alloc_end = addr + (PAGE_SIZE << order); - unsigned long used = addr + PAGE_ALIGN(size); + unsigned long nr = DIV_ROUND_UP(size, PAGE_SIZE); + struct page *page = virt_to_page((void *)addr); + struct page *last = page + nr; - split_page(virt_to_page((void *)addr), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } + split_page_owner(page, 1 << order); + split_page_memcg(page, 1 << order); + while (page < --last) + set_page_refcounted(last); + + last = page + (1UL << order); + for (page += nr; page < last; page++) + __free_pages_ok(page, 0, FPI_TO_TAIL); } return (void *)addr; } @@ -6866,13 +6865,11 @@ static void __ref memmap_init_compound(struct page *head, set_page_count(page, 0); /* - * The first tail page stores compound_mapcount_ptr() and - * compound_order() and the second tail page stores - * compound_pincount_ptr(). Call prep_compound_head() after - * the first and second tail pages have been initialized to - * not have the data overwritten. + * The first tail page stores important compound page info. 
+ * Call prep_compound_head() after the first tail page has + * been initialized, to not have the data overwritten. */ - if (pfn == head_pfn + 2) + if (pfn == head_pfn + 1) prep_compound_head(head, order); } } diff --git a/mm/page_ext.c b/mm/page_ext.c index affe80243b6d..4ee522fd381c 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -166,7 +166,7 @@ struct page_ext *page_ext_get(struct page *page) /** * page_ext_put() - Working with page extended information is done. - * @page_ext - Page extended information received from page_ext_get(). + * @page_ext: Page extended information received from page_ext_get(). * * The page extended information of the page may not be valid after this * function is called. @@ -513,7 +513,7 @@ void __init page_ext_init(void) cond_resched(); } } - hotplug_memory_notifier(page_ext_callback, 0); + hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI); pr_info("allocated %ld bytes of page_ext\n", total_usage); invoke_init_callbacks(); return; diff --git a/mm/page_io.c b/mm/page_io.c index 2af34dd8fa4d..3a5f921b932e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -376,7 +376,7 @@ void swap_write_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, WRITE, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_SOURCE, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_write_complete(&sio->iocb, ret); @@ -530,7 +530,7 @@ void __swap_read_unplug(struct swap_iocb *sio) struct address_space *mapping = sio->iocb.ki_filp->f_mapping; int ret; - iov_iter_bvec(&from, READ, sio->bvec, sio->pages, sio->len); + iov_iter_bvec(&from, ITER_DEST, sio->bvec, sio->pages, sio->len); ret = mapping->a_ops->swap_rw(&sio->iocb, &from); if (ret != -EIOCBQUEUED) sio_read_complete(&sio->iocb, ret); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04141a9bea70..47fbc1696466 100644 --- a/mm/page_isolation.c +++ 
b/mm/page_isolation.c @@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); VM_BUG_ON(!is_migrate_isolate(mt)); } else { diff --git a/mm/page_reporting.c b/mm/page_reporting.c index 382958eef8a9..79a8554f024c 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -11,10 +11,42 @@ #include "page_reporting.h" #include "internal.h" -unsigned int page_reporting_order = MAX_ORDER; -module_param(page_reporting_order, uint, 0644); +/* Initialize to an unsupported value */ +unsigned int page_reporting_order = -1; + +static int page_order_update_notify(const char *val, const struct kernel_param *kp) +{ + /* + * If param is set beyond this limit, order is set to default + * pageblock_order value + */ + return param_set_uint_minmax(val, kp, 0, MAX_ORDER-1); +} + +static const struct kernel_param_ops page_reporting_param_ops = { + .set = &page_order_update_notify, + /* + * For the get op, use param_get_int instead of param_get_uint. + * This is to make sure that when unset the initialized value of + * -1 is shown correctly + */ + .get = &param_get_int, +}; + +module_param_cb(page_reporting_order, &page_reporting_param_ops, + &page_reporting_order, 0644); MODULE_PARM_DESC(page_reporting_order, "Set page reporting order"); +/* + * This symbol is also a kernel parameter. Export the page_reporting_order + * symbol so that other drivers can access it to control order values without + * having to introduce another configurable parameter.
Only one driver can + * register with the page_reporting driver for the service, so we have just + * one control parameter for the use case(which can be accessed in both + * drivers) + */ +EXPORT_SYMBOL_GPL(page_reporting_order); + #define PAGE_REPORTING_DELAY (2 * HZ) static struct page_reporting_dev_info __rcu *pr_dev_info __read_mostly; @@ -330,10 +362,18 @@ int page_reporting_register(struct page_reporting_dev_info *prdev) } /* - * Update the page reporting order if it's specified by driver. - * Otherwise, it falls back to @pageblock_order. + * If the page_reporting_order value is not set, we check if + * an order is provided from the driver that is performing the + * registration. If that is not provided either, we default to + * pageblock_order. */ - page_reporting_order = prdev->order ? : pageblock_order; + + if (page_reporting_order == -1) { + if (prdev->order > 0 && prdev->order <= MAX_ORDER) + page_reporting_order = prdev->order; + else + page_reporting_order = pageblock_order; + } /* initialize state and work structures */ atomic_set(&prdev->state, PAGE_REPORTING_IDLE); diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 433dbce13fe1..93e633c1d587 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -4,6 +4,7 @@ * Copyright (c) 2021, Google LLC. 
* Pasha Tatashin <pasha.tatashin@soleen.com> */ +#include <linux/kstrtox.h> #include <linux/mm.h> #include <linux/page_table_check.h> @@ -23,7 +24,7 @@ EXPORT_SYMBOL(page_table_check_disabled); static int __init early_page_table_check_param(char *buf) { - return strtobool(buf, &__page_table_check_enabled); + return kstrtobool(buf, &__page_table_check_enabled); } early_param("page_table_check", early_page_table_check_param); diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 2ff3a5bebceb..7f1c9b274906 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -517,6 +517,26 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, return walk_pgd_range(start, end, &walk); } +int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + struct mm_walk walk = { + .ops = ops, + .mm = vma->vm_mm, + .vma = vma, + .private = private, + }; + + if (start >= end || !walk.mm) + return -EINVAL; + if (start < vma->vm_start || end > vma->vm_end) + return -EINVAL; + + mmap_assert_locked(walk.mm); + return __walk_page_range(start, end, &walk); +} + int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private) { @@ -526,18 +546,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, .vma = vma, .private = private, }; - int err; if (!walk.mm) return -EINVAL; mmap_assert_locked(walk.mm); - - err = walk_page_test(vma->vm_start, vma->vm_end, &walk); - if (err > 0) - return 0; - if (err < 0) - return err; return __walk_page_range(vma->vm_start, vma->vm_end, &walk); } diff --git a/mm/percpu.c b/mm/percpu.c index 27697b2429c2..acd78da0493b 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -72,7 +72,6 @@ #include <linux/cpumask.h> #include <linux/memblock.h> #include <linux/err.h> -#include <linux/lcm.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> @@ -174,9 +173,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk 
create/destroy, [de]pop, map ext struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ -/* chunks which need their map areas extended, protected by pcpu_lock */ -static LIST_HEAD(pcpu_map_extend_chunks); - /* * The number of empty populated pages, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count. @@ -834,13 +830,15 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, /* * Update s_block. - * block->first_free must be updated if the allocation takes its place. - * If the allocation breaks the contig_hint, a scan is required to - * restore this hint. */ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; + /* + * block->first_free must be updated if the allocation takes its place. + * If the allocation breaks the contig_hint, a scan is required to + * restore this hint. + */ if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), @@ -915,6 +913,12 @@ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, } } + /* + * If the allocation is not atomic, some blocks may not be + * populated with pages, while we account it here. The number + * of pages will be added back with pcpu_chunk_populated() + * when populating pages. + */ if (nr_empty_pages) pcpu_update_empty_pages(chunk, -nr_empty_pages); @@ -1342,7 +1346,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; - unsigned long aligned_addr, lcm_align; + unsigned long aligned_addr; int start_offset, offset_bits, region_size, region_bits; size_t alloc_size; @@ -1350,14 +1354,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, aligned_addr = tmp_addr & PAGE_MASK; start_offset = tmp_addr - aligned_addr; - - /* - * Align the end of the region with the LCM of PAGE_SIZE and - * PCPU_BITMAP_BLOCK_SIZE. 
One of these constants is a multiple of - * the other. - */ - lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE); - region_size = ALIGN(start_offset + map_size, lcm_align); + region_size = ALIGN(start_offset + map_size, PAGE_SIZE); /* allocate chunk */ alloc_size = struct_size(chunk, populated, @@ -1820,16 +1817,12 @@ restart: spin_unlock_irqrestore(&pcpu_lock, flags); - /* - * No space left. Create a new chunk. We don't want multiple - * tasks to create chunks simultaneously. Serialize and create iff - * there's still no empty chunk after grabbing the mutex. - */ if (is_atomic) { err = "atomic alloc failed, no space left"; goto fail; } + /* No space left. Create a new chunk. */ if (list_empty(&pcpu_chunk_lists[pcpu_free_slot])) { chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { @@ -2146,9 +2139,9 @@ static void pcpu_reclaim_populated(void) * other accessor is the free path which only returns area back to the * allocator not touching the populated bitmap. */ - while (!list_empty(&pcpu_chunk_lists[pcpu_to_depopulate_slot])) { - chunk = list_first_entry(&pcpu_chunk_lists[pcpu_to_depopulate_slot], - struct pcpu_chunk, list); + while ((chunk = list_first_entry_or_null( + &pcpu_chunk_lists[pcpu_to_depopulate_slot], + struct pcpu_chunk, list))) { WARN_ON(chunk->immutable); /* @@ -2166,7 +2159,7 @@ static void pcpu_reclaim_populated(void) /* reintegrate chunk to prevent atomic alloc failures */ if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_HIGH) { reintegrate = true; - goto end_chunk; + break; } /* @@ -2202,7 +2195,6 @@ static void pcpu_reclaim_populated(void) end = -1; } -end_chunk: /* batch tlb flush per chunk to amortize cost */ if (freed_page_start < freed_page_end) { spin_unlock_irq(&pcpu_lock); diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 4bcc11958089..78dfaf9e8990 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -263,7 +263,7 @@ static ssize_t process_vm_rw(pid_t pid, struct iovec *iov_r; struct iov_iter iter; ssize_t 
rc; - int dir = vm_write ? WRITE : READ; + int dir = vm_write ? ITER_SOURCE : ITER_DEST; if (flags != 0) return -EINVAL; diff --git a/mm/rmap.c b/mm/rmap.c index 2ec925e5fa6a..b616870a09be 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -315,8 +315,8 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) enomem_failure: /* - * dst->anon_vma is dropped here otherwise its degree can be incorrectly - * decremented in unlink_anon_vmas(). + * dst->anon_vma is dropped here otherwise its num_active_vmas can + * be incorrectly decremented in unlink_anon_vmas(). * We can safely do this because callers of anon_vma_clone() don't care * about dst->anon_vma if anon_vma_clone() failed. */ @@ -1085,6 +1085,29 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, return page_vma_mkclean_one(&pvmw); } +int total_compound_mapcount(struct page *head) +{ + int mapcount = head_compound_mapcount(head); + int nr_subpages; + int i; + + /* In the common case, avoid the loop when no subpages mapped by PTE */ + if (head_subpages_mapcount(head) == 0) + return mapcount; + /* + * Add all the PTE mappings of those subpages mapped by PTE. + * Limit the loop, knowing that only subpages_mapcount are mapped? + * Perhaps: given all the raciness, that may be a good or a bad idea. 
+ */ + nr_subpages = thp_nr_pages(head); + for (i = 0; i < nr_subpages; i++) + mapcount += atomic_read(&head[i]._mapcount); + + /* But each of those _mapcounts was based on -1 */ + mapcount += nr_subpages; + return mapcount; +} + /** * page_move_anon_rmap - move a page to our anon_vma * @page: the page to move to our anon_vma @@ -1194,38 +1217,50 @@ static void __page_check_anon_rmap(struct page *page, void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, rmap_t flags) { + atomic_t *mapped; + int nr = 0, nr_pmdmapped = 0; bool compound = flags & RMAP_COMPOUND; - bool first; + bool first = true; if (unlikely(PageKsm(page))) lock_page_memcg(page); - else - VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound) { - atomic_t *mapcount; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - mapcount = compound_mapcount_ptr(page); - first = atomic_inc_and_test(mapcount); - } else { + /* Is page being mapped by PTE? Is this its first map to be added? */ + if (likely(!compound)) { first = atomic_inc_and_test(&page->_mapcount); + nr = first; + if (first && PageCompound(page)) { + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_inc_return_relaxed(mapped); + nr = (nr < COMPOUND_MAPPED); + } + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ + + first = atomic_inc_and_test(compound_mapcount_ptr(page)); + if (first) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); + if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of a remove and another add? 
*/ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* Raced ahead of a remove of COMPOUND_MAPPED */ + nr = 0; + } + } } + VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page); VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page); - if (first) { - int nr = compound ? thp_nr_pages(page) : 1; - /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption - * disabled. - */ - if (compound) - __mod_lruvec_page_state(page, NR_ANON_THPS, nr); + if (nr_pmdmapped) + __mod_lruvec_page_state(page, NR_ANON_THPS, nr_pmdmapped); + if (nr) __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); - } if (unlikely(PageKsm(page))) unlock_page_memcg(page); @@ -1256,22 +1291,24 @@ void page_add_anon_rmap(struct page *page, void page_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { - const bool compound = PageCompound(page); - int nr = compound ? 
thp_nr_pages(page) : 1; + int nr; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); __SetPageSwapBacked(page); - if (compound) { + + if (likely(!PageCompound(page))) { + /* increment count (starts at -1) */ + atomic_set(&page->_mapcount, 0); + nr = 1; + } else { VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); - + atomic_set(subpages_mapcount_ptr(page), COMPOUND_MAPPED); + nr = thp_nr_pages(page); __mod_lruvec_page_state(page, NR_ANON_THPS, nr); - } else { - /* increment count (starts at -1) */ - atomic_set(&page->_mapcount, 0); } + __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr); __page_set_anon_rmap(page, vma, address, 1); } @@ -1287,45 +1324,45 @@ void page_add_new_anon_rmap(struct page *page, void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, bool compound) { - int i, nr = 0; + atomic_t *mapped; + int nr = 0, nr_pmdmapped = 0; + bool first; VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); lock_page_memcg(page); - if (compound && PageTransHuge(page)) { - int nr_pages = thp_nr_pages(page); - for (i = 0; i < nr_pages; i++) { - if (atomic_inc_and_test(&page[i]._mapcount)) - nr++; + /* Is page being mapped by PTE? Is this its first map to be added? */ + if (likely(!compound)) { + first = atomic_inc_and_test(&page->_mapcount); + nr = first; + if (first && PageCompound(page)) { + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_inc_return_relaxed(mapped); + nr = (nr < COMPOUND_MAPPED); } - if (!atomic_inc_and_test(compound_mapcount_ptr(page))) - goto out; - - /* - * It is racy to ClearPageDoubleMap in page_remove_file_rmap(); - * but page lock is held by all page_add_file_rmap() compound - * callers, and SetPageDoubleMap below warns if !PageLocked: - * so here is a place that DoubleMap can be safely cleared. 
- */ - VM_WARN_ON_ONCE(!PageLocked(page)); - if (nr == nr_pages && PageDoubleMap(page)) - ClearPageDoubleMap(page); - - if (PageSwapBacked(page)) - __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, - nr_pages); - else - __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, - nr_pages); - } else { - if (PageTransCompound(page) && page_mapping(page)) { - VM_WARN_ON_ONCE(!PageLocked(page)); - SetPageDoubleMap(compound_head(page)); + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ + + first = atomic_inc_and_test(compound_mapcount_ptr(page)); + if (first) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_add_return_relaxed(COMPOUND_MAPPED, mapped); + if (likely(nr < COMPOUND_MAPPED + COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of a remove and another add? */ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* Raced ahead of a remove of COMPOUND_MAPPED */ + nr = 0; + } } - if (atomic_inc_and_test(&page->_mapcount)) - nr++; } -out: + + if (nr_pmdmapped) + __mod_lruvec_page_state(page, PageSwapBacked(page) ? + NR_SHMEM_PMDMAPPED : NR_FILE_PMDMAPPED, nr_pmdmapped); if (nr) __mod_lruvec_page_state(page, NR_FILE_MAPPED, nr); unlock_page_memcg(page); @@ -1333,132 +1370,87 @@ out: mlock_vma_page(page, vma, compound); } -static void page_remove_file_rmap(struct page *page, bool compound) +/** + * page_remove_rmap - take down pte mapping from a page + * @page: page to remove mapping from + * @vma: the vm area from which the mapping is removed + * @compound: uncharge the page as compound or small page + * + * The caller needs to hold the pte lock. + */ +void page_remove_rmap(struct page *page, + struct vm_area_struct *vma, bool compound) { - int i, nr = 0; + atomic_t *mapped; + int nr = 0, nr_pmdmapped = 0; + bool last; VM_BUG_ON_PAGE(compound && !PageHead(page), page); - /* Hugepages are not counted in NR_FILE_MAPPED for now. 
*/ + /* Hugetlb pages are not counted in NR_*MAPPED */ if (unlikely(PageHuge(page))) { /* hugetlb pages are always mapped with pmds */ atomic_dec(compound_mapcount_ptr(page)); return; } - /* page still mapped by someone else? */ - if (compound && PageTransHuge(page)) { - int nr_pages = thp_nr_pages(page); + lock_page_memcg(page); - for (i = 0; i < nr_pages; i++) { - if (atomic_add_negative(-1, &page[i]._mapcount)) - nr++; + /* Is page being unmapped by PTE? Is this its last map to be removed? */ + if (likely(!compound)) { + last = atomic_add_negative(-1, &page->_mapcount); + nr = last; + if (last && PageCompound(page)) { + mapped = subpages_mapcount_ptr(compound_head(page)); + nr = atomic_dec_return_relaxed(mapped); + nr = (nr < COMPOUND_MAPPED); } - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - goto out; - if (PageSwapBacked(page)) - __mod_lruvec_page_state(page, NR_SHMEM_PMDMAPPED, - -nr_pages); - else - __mod_lruvec_page_state(page, NR_FILE_PMDMAPPED, - -nr_pages); - } else { - if (atomic_add_negative(-1, &page->_mapcount)) - nr++; - } -out: - if (nr) - __mod_lruvec_page_state(page, NR_FILE_MAPPED, -nr); -} - -static void page_remove_anon_compound_rmap(struct page *page) -{ - int i, nr; - - if (!atomic_add_negative(-1, compound_mapcount_ptr(page))) - return; - - /* Hugepages are not counted in NR_ANON_PAGES for now. */ - if (unlikely(PageHuge(page))) - return; - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) - return; - - __mod_lruvec_page_state(page, NR_ANON_THPS, -thp_nr_pages(page)); - - if (TestClearPageDoubleMap(page)) { - /* - * Subpages can be mapped with PTEs too. Check how many of - * them are still mapped. 
- */ - for (i = 0, nr = 0; i < thp_nr_pages(page); i++) { - if (atomic_add_negative(-1, &page[i]._mapcount)) - nr++; + } else if (PageTransHuge(page)) { + /* That test is redundant: it's for safety or to optimize out */ + + last = atomic_add_negative(-1, compound_mapcount_ptr(page)); + if (last) { + mapped = subpages_mapcount_ptr(page); + nr = atomic_sub_return_relaxed(COMPOUND_MAPPED, mapped); + if (likely(nr < COMPOUND_MAPPED)) { + nr_pmdmapped = thp_nr_pages(page); + nr = nr_pmdmapped - (nr & SUBPAGES_MAPPED); + /* Raced ahead of another remove and an add? */ + if (unlikely(nr < 0)) + nr = 0; + } else { + /* An add of COMPOUND_MAPPED raced ahead */ + nr = 0; + } } + } + if (nr_pmdmapped) { + __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_THPS : + (PageSwapBacked(page) ? NR_SHMEM_PMDMAPPED : + NR_FILE_PMDMAPPED), -nr_pmdmapped); + } + if (nr) { + __mod_lruvec_page_state(page, PageAnon(page) ? NR_ANON_MAPPED : + NR_FILE_MAPPED, -nr); /* - * Queue the page for deferred split if at least one small + * Queue anon THP for deferred split if at least one small * page of the compound page is unmapped, but at least one * small page is still mapped. */ - if (nr && nr < thp_nr_pages(page)) - deferred_split_huge_page(page); - } else { - nr = thp_nr_pages(page); - } - - if (nr) - __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr); -} - -/** - * page_remove_rmap - take down pte mapping from a page - * @page: page to remove mapping from - * @vma: the vm area from which the mapping is removed - * @compound: uncharge the page as compound or small page - * - * The caller needs to hold the pte lock. 
- */ -void page_remove_rmap(struct page *page, - struct vm_area_struct *vma, bool compound) -{ - lock_page_memcg(page); - - if (!PageAnon(page)) { - page_remove_file_rmap(page, compound); - goto out; - } - - if (compound) { - page_remove_anon_compound_rmap(page); - goto out; + if (PageTransCompound(page) && PageAnon(page)) + if (!compound || nr < nr_pmdmapped) + deferred_split_huge_page(compound_head(page)); } - /* page still mapped by someone else? */ - if (!atomic_add_negative(-1, &page->_mapcount)) - goto out; - /* - * We use the irq-unsafe __{inc|mod}_zone_page_stat because - * these counters are not modified in interrupt context, and - * pte lock(a spinlock) is held, which implies preemption disabled. - */ - __dec_lruvec_page_state(page, NR_ANON_MAPPED); - - if (PageTransCompound(page)) - deferred_split_huge_page(compound_head(page)); - - /* - * It would be tidy to reset the PageAnon mapping here, + * It would be tidy to reset PageAnon mapping when fully unmapped, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_unref_page, + * before us: so leave the reset to free_pages_prepare, * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. 
*/ -out: + unlock_page_memcg(page); munlock_vma_page(page, vma, compound); @@ -1801,7 +1793,7 @@ static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg) return vma_is_temporary_stack(vma); } -static int page_not_mapped(struct folio *folio) +static int folio_not_mapped(struct folio *folio) { return !folio_mapped(folio); } @@ -1822,7 +1814,7 @@ void try_to_unmap(struct folio *folio, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, .arg = (void *)flags, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; @@ -2150,7 +2142,7 @@ void try_to_migrate(struct folio *folio, enum ttu_flags flags) struct rmap_walk_control rwc = { .rmap_one = try_to_migrate_one, .arg = (void *)flags, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, }; @@ -2297,7 +2289,7 @@ static bool folio_make_device_exclusive(struct folio *folio, }; struct rmap_walk_control rwc = { .rmap_one = page_make_device_exclusive_one, - .done = page_not_mapped, + .done = folio_not_mapped, .anon_lock = folio_lock_anon_vma_read, .arg = &args, }; @@ -2569,9 +2561,9 @@ void hugepage_add_new_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* increment count (starts at -1) */ atomic_set(compound_mapcount_ptr(page), 0); - atomic_set(compound_pincount_ptr(page), 0); - + ClearHPageRestoreReserve(page); __page_set_anon_rmap(page, vma, address, 1); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/mm/shmem.c b/mm/shmem.c index 8280a5cb48df..c301487be5fb 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -237,11 +237,17 @@ static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; static const struct vm_operations_struct shmem_vm_ops; +static const struct 
vm_operations_struct shmem_anon_vm_ops; static struct file_system_type shmem_fs_type; +bool vma_is_anon_shmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &shmem_anon_vm_ops; +} + bool vma_is_shmem(struct vm_area_struct *vma) { - return vma->vm_ops == &shmem_vm_ops; + return vma_is_anon_shmem(vma) || vma->vm_ops == &shmem_vm_ops; } static LIST_HEAD(shmem_swaplist); @@ -922,21 +928,18 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_batch_init(&fbatch); index = start; - while (index < end && find_lock_entries(mapping, index, end - 1, + while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; - index = indices[i]; - if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, folio); + indices[i], folio); continue; } - index += folio_nr_pages(folio) - 1; if (!unfalloc || !folio_test_uptodate(folio)) truncate_inode_folio(mapping, folio); @@ -945,9 +948,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } + /* + * When undoing a failed fallocate, we want none of the partial folio + * zeroing and splitting below, but shall want to truncate the whole + * folio when !uptodate indicates that it was added by this fallocate, + * even when [lstart, lend] covers only a part of the folio. 
+ */ + if (unfalloc) + goto whole_folios; + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); if (folio) { @@ -973,11 +984,13 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, folio_put(folio); } +whole_folios: + index = start; while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &fbatch, + if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) @@ -989,13 +1002,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, for (i = 0; i < folio_batch_count(&fbatch); i++) { folio = fbatch.folios[i]; - index = indices[i]; if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, folio)) { + if (shmem_free_swap(mapping, indices[i], folio)) { /* Swap was replaced by page: retry */ - index--; + index = indices[i]; break; } nr_swaps_freed++; @@ -1008,19 +1020,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ folio_unlock(folio); - index--; + index = indices[i]; break; } VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); truncate_inode_folio(mapping, folio); } - index = folio->index + folio_nr_pages(folio) - 1; folio_unlock(folio); } folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); - index++; } spin_lock_irq(&info->lock); @@ -1121,7 +1131,7 @@ static int shmem_setattr(struct user_namespace *mnt_userns, setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, dentry, inode->i_mode); if (!error && update_ctime) { inode->i_ctime = current_time(inode); if (update_mtime) @@ -1689,7 +1699,7 @@ static void 
shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, swp_entry_t swapin_error; void *old; - swapin_error = make_swapin_error_entry(&folio->page); + swapin_error = make_swapin_error_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); @@ -1833,7 +1843,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct folio *folio; - pgoff_t hindex = index; + pgoff_t hindex; gfp_t huge_gfp; int error; int once = 0; @@ -1871,7 +1881,6 @@ repeat: } if (folio) { - hindex = folio->index; if (sgp == SGP_WRITE) folio_mark_accessed(folio); if (folio_test_uptodate(folio)) @@ -2271,7 +2280,8 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { - struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + struct inode *inode = file_inode(file); + struct shmem_inode_info *info = SHMEM_I(inode); int ret; ret = seal_check_future_write(info->seals, vma); @@ -2282,7 +2292,11 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_MTE_ALLOWED; file_accessed(file); - vma->vm_ops = &shmem_vm_ops; + /* This is anonymous shared memory if it is unlinked at the time of mmap */ + if (inode->i_nlink) + vma->vm_ops = &shmem_vm_ops; + else + vma->vm_ops = &shmem_anon_vm_ops; return 0; } @@ -2424,9 +2438,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!zeropage) { /* COPY */ page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. 
For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); + pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ @@ -3255,7 +3286,7 @@ static int shmem_initxattrs(struct inode *inode, memcpy(new_xattr->name + XATTR_SECURITY_PREFIX_LEN, xattr->name, len); - simple_xattr_list_add(&info->xattrs, new_xattr); + simple_xattr_add(&info->xattrs, new_xattr); } return 0; @@ -3893,6 +3924,7 @@ EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, + .open = generic_file_open, .get_unmapped_area = shmem_get_unmapped_area, #ifdef CONFIG_TMPFS .llseek = shmem_file_llseek, @@ -3978,6 +4010,15 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; +static const struct vm_operations_struct shmem_anon_vm_ops = { + .fault = shmem_fault, + .map_pages = filemap_map_pages, +#ifdef CONFIG_NUMA + .set_policy = shmem_set_policy, + .get_policy = shmem_get_policy, +#endif +}; + int shmem_init_fs_context(struct fs_context *fc) { struct shmem_options *ctx; @@ -4153,6 +4194,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) EXPORT_SYMBOL_GPL(shmem_truncate_range); #define shmem_vm_ops generic_file_vm_ops +#define shmem_anon_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 @@ 
-4258,7 +4300,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; - vma->vm_ops = &shmem_vm_ops; + vma->vm_ops = &shmem_anon_vm_ops; return 0; } diff --git a/mm/slab.c b/mm/slab.c index 59c8e28f7b6a..7a269db050ee 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -234,7 +234,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; - spin_lock_init(&parent->list_lock); + raw_spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; } @@ -559,9 +559,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, slab_node = slab_nid(slab); n = get_node(cachep, slab_node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, &objp, 1, slab_node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } @@ -684,7 +684,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); /* * Stuff objects into the remote nodes shared array first. 
* That way we could avoid the overhead of putting the objects @@ -695,7 +695,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); } } @@ -768,9 +768,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, slabs_destroy(cachep, &list); } else { n = get_node(cachep, slab_node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, &objp, 1, slab_node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } return 1; @@ -811,10 +811,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) */ n = get_node(cachep, node); if (n) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); return 0; } @@ -893,7 +893,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, goto fail; n = get_node(cachep, node); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); if (n->shared && force_change) { free_block(cachep, n->shared->entry, n->shared->avail, node, &list); @@ -911,7 +911,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, new_alien = NULL; } - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); /* @@ -950,7 +950,7 @@ static void cpuup_canceled(long cpu) if (!n) continue; - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; @@ -961,7 +961,7 @@ static void cpuup_canceled(long cpu) nc->avail = 0; if (!cpumask_empty(mask)) { - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); goto free_slab; } @@ -975,7 +975,7 @@ static void cpuup_canceled(long cpu) alien = n->alien; 
n->alien = NULL; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); kfree(shared); if (alien) { @@ -1159,7 +1159,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node * /* * Do not assume that spinlocks can be initialized via memcpy: */ - spin_lock_init(&ptr->list_lock); + raw_spin_lock_init(&ptr->list_lock); MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->node[nodeid] = ptr; @@ -1330,11 +1330,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) for_each_kmem_cache_node(cachep, node, n) { unsigned long total_slabs, free_slabs, free_objs; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); total_slabs = n->total_slabs; free_slabs = n->free_slabs; free_objs = n->free_objects; - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", node, total_slabs - free_slabs, total_slabs, @@ -1370,6 +1370,8 @@ static struct slab *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, account_slab(slab, cachep->gfporder, cachep, flags); __folio_set_slab(folio); + /* Make the flag visible before any changes to folio->mapping */ + smp_wmb(); /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (sk_memalloc_socks() && page_is_pfmemalloc(folio_page(folio, 0))) slab_set_pfmemalloc(slab); @@ -1387,9 +1389,11 @@ static void kmem_freepages(struct kmem_cache *cachep, struct slab *slab) BUG_ON(!folio_test_slab(folio)); __slab_clear_pfmemalloc(slab); - __folio_clear_slab(folio); page_mapcount_reset(folio_page(folio, 0)); folio->mapping = NULL; + /* Make the mapping reset visible before clearing the flag */ + smp_wmb(); + __folio_clear_slab(folio); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += 1 << order; @@ -2096,7 +2100,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - 
assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); + assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2104,7 +2108,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&get_node(cachep, node)->list_lock); + assert_raw_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2144,9 +2148,9 @@ static void do_drain(void *arg) check_irq_off(); ac = cpu_cache_get(cachep); n = get_node(cachep, node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); ac->avail = 0; slabs_destroy(cachep, &list); } @@ -2164,9 +2168,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) drain_alien_cache(cachep, n->alien); for_each_kmem_cache_node(cachep, node, n) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); drain_array_locked(cachep, n->shared, node, true, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } @@ -2188,10 +2192,10 @@ static int drain_freelist(struct kmem_cache *cache, nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); p = n->slabs_free.prev; if (p == &n->slabs_free) { - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); goto out; } @@ -2204,7 +2208,7 @@ static int drain_freelist(struct kmem_cache *cache, * to the cache. 
*/ n->free_objects -= cache->num; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slab_destroy(cache, slab); nr_freed++; } @@ -2629,7 +2633,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) INIT_LIST_HEAD(&slab->slab_list); n = get_node(cachep, slab_nid(slab)); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); n->total_slabs++; if (!slab->active) { list_add_tail(&slab->slab_list, &n->slabs_free); @@ -2639,7 +2643,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct slab *slab) STATS_INC_GROWN(cachep); n->free_objects += cachep->num - slab->active; - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); } @@ -2805,7 +2809,7 @@ static struct slab *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct slab *slab; - assert_spin_locked(&n->list_lock); + assert_raw_spin_locked(&n->list_lock); slab = list_first_entry_or_null(&n->slabs_partial, struct slab, slab_list); if (!slab) { @@ -2832,10 +2836,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, if (!gfp_pfmemalloc_allowed(flags)) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); slab = get_first_slab(n, true); if (!slab) { - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return NULL; } @@ -2844,7 +2848,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, fixup_slab_list(cachep, n, slab, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); return obj; @@ -2903,7 +2907,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) if (!n->free_objects && (!shared || !shared->avail)) goto direct_grow; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ @@ -2927,7 +2931,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t 
flags) must_grow: n->free_objects -= ac->avail; alloc_done: - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); direct_grow: @@ -3147,7 +3151,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, BUG_ON(!n); check_irq_off(); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); slab = get_first_slab(n, false); if (!slab) goto must_grow; @@ -3165,12 +3169,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, fixup_slab_list(cachep, n, slab, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); return obj; must_grow: - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slab = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); if (slab) { /* This slab isn't counted yet so don't update free_objects */ @@ -3254,7 +3258,8 @@ slab_alloc_node(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags, init = slab_want_init_on_alloc(flags, cachep); out: - slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init); + slab_post_alloc_hook(cachep, objcg, flags, 1, &objp, init, + cachep->object_size); return objp; } @@ -3325,7 +3330,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) check_irq_off(); n = get_node(cachep, node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; int max = shared_array->limit - shared_array->avail; @@ -3354,7 +3359,7 @@ free_done: STATS_SET_FREEABLE(cachep, i); } #endif - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); slabs_destroy(cachep, &list); @@ -3446,16 +3451,6 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return ret; } -/** - * kmem_cache_alloc - Allocate an object - * @cachep: The cache to allocate from. 
- * @flags: See kmalloc(). - * - * Allocate an object from this cache. The flags are only relevant - * if the cache has no available objects. - * - * Return: pointer to the new object or %NULL in case of error - */ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) { return __kmem_cache_alloc_lru(cachep, NULL, flags); @@ -3507,13 +3502,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * Done outside of the IRQ disabled section. */ slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); + slab_want_init_on_alloc(flags, s), s->object_size); /* FIXME: Trace call missing. Christoph would like a bulk variant */ return size; error: local_irq_enable(); cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_); - slab_post_alloc_hook(s, objcg, flags, i, p, false); + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); kmem_cache_free_bulk(s, i, p); return 0; } @@ -3721,9 +3716,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, node = cpu_to_mem(cpu); n = get_node(cachep, node); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } free_percpu(prev); @@ -3815,9 +3810,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, return; } - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); drain_array_locked(cachep, ac, node, false, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } @@ -3901,7 +3896,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); total_slabs += n->total_slabs; free_slabs += n->free_slabs; @@ -3910,7 +3905,7 @@ void get_slabinfo(struct 
kmem_cache *cachep, struct slabinfo *sinfo) if (n->shared) shared_avail += n->shared->avail; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); } num_objs = total_slabs * cachep->num; active_slabs = total_slabs - free_slabs; diff --git a/mm/slab.h b/mm/slab.h index 0202a8c2f0d2..7cc432969945 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -11,37 +11,43 @@ struct slab { #if defined(CONFIG_SLAB) + struct kmem_cache *slab_cache; union { - struct list_head slab_list; + struct { + struct list_head slab_list; + void *freelist; /* array of free object indexes */ + void *s_mem; /* first object */ + }; struct rcu_head rcu_head; }; - struct kmem_cache *slab_cache; - void *freelist; /* array of free object indexes */ - void *s_mem; /* first object */ unsigned int active; #elif defined(CONFIG_SLUB) - union { - struct list_head slab_list; - struct rcu_head rcu_head; -#ifdef CONFIG_SLUB_CPU_PARTIAL - struct { - struct slab *next; - int slabs; /* Nr of slabs left */ - }; -#endif - }; struct kmem_cache *slab_cache; - /* Double-word boundary */ - void *freelist; /* first free object */ union { - unsigned long counters; struct { - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; + union { + struct list_head slab_list; +#ifdef CONFIG_SLUB_CPU_PARTIAL + struct { + struct slab *next; + int slabs; /* Nr of slabs left */ + }; +#endif + }; + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + unsigned long counters; + struct { + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; }; + struct rcu_head rcu_head; }; unsigned int __unused; @@ -66,9 +72,10 @@ struct slab { #define SLAB_MATCH(pg, sl) \ static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl)) SLAB_MATCH(flags, __page_flags); -SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ #ifndef CONFIG_SLOB -SLAB_MATCH(rcu_head, rcu_head); +SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ +#else 
+SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */ #endif SLAB_MATCH(_refcount, __page_refcount); #ifdef CONFIG_MEMCG @@ -76,6 +83,9 @@ SLAB_MATCH(memcg_data, memcg_data); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && defined(CONFIG_SLUB) +static_assert(IS_ALIGNED(offsetof(struct slab, freelist), 2*sizeof(void *))); +#endif /** * folio_slab - Converts from folio to slab. @@ -207,8 +217,6 @@ struct kmem_cache { unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ slab_flags_t flags; /* Active flags on the slab */ - unsigned int useroffset;/* Usercopy region offset */ - unsigned int usersize; /* Usercopy region size */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ @@ -336,7 +344,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_NO_USER_FLAGS) + SLAB_TEMPORARY | SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS | SLAB_KMALLOC) #else #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) #endif @@ -356,6 +365,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ SLAB_ACCOUNT | \ + SLAB_KMALLOC | \ SLAB_NO_USER_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); @@ -720,13 +730,27 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, static inline void slab_post_alloc_hook(struct kmem_cache *s, struct obj_cgroup *objcg, gfp_t flags, - size_t size, void **p, bool init) + size_t size, void **p, bool init, + unsigned int orig_size) { + unsigned int zero_size = s->object_size; size_t i; flags &= gfp_allowed_mask; /* + * For kmalloc object, the allocated memory size(object_size) is likely + * larger 
than the requested size(orig_size). If redzone check is + * enabled for the extra space, don't zero it, as it will be redzoned + * soon. The redzone operation for this extra space could be seen as a + * replacement of current poisoning under certain debug option, and + * won't break other sanity checks. + */ + if (kmem_cache_debug_flags(s, SLAB_STORE_USER | SLAB_RED_ZONE) && + (s->flags & SLAB_KMALLOC)) + zero_size = orig_size; + + /* * As memory initialization might be integrated into KASAN, * kasan_slab_alloc and initialization memset must be * kept together to avoid discrepancies in behavior. @@ -736,7 +760,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, for (i = 0; i < size; i++) { p[i] = kasan_slab_alloc(s, p[i], flags, init); if (p[i] && init && !kasan_has_integrated_init()) - memset(p[i], 0, s->object_size); + memset(p[i], 0, zero_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); kmsan_slab_alloc(s, p[i], flags); @@ -750,9 +774,8 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * The slab lists for all objects. 
*/ struct kmem_cache_node { - spinlock_t list_lock; - #ifdef CONFIG_SLAB + raw_spinlock_t list_lock; struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; @@ -768,6 +791,7 @@ struct kmem_cache_node { #endif #ifdef CONFIG_SLUB + spinlock_t list_lock; unsigned long nr_partial; struct list_head partial; #ifdef CONFIG_SLUB_DEBUG @@ -871,4 +895,8 @@ void __check_heap_object(const void *ptr, unsigned long n, } #endif +#ifdef CONFIG_SLUB_DEBUG +void skip_orig_size_check(struct kmem_cache *s, const void *object); +#endif + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 33b1886b06eb..1cba98acc486 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -143,8 +143,10 @@ int slab_unmergeable(struct kmem_cache *s) if (s->ctor) return 1; +#ifdef CONFIG_HARDENED_USERCOPY if (s->usersize) return 1; +#endif /* * We may have set a slab to be unmergeable during bootstrap. @@ -223,8 +225,10 @@ static struct kmem_cache *create_cache(const char *name, s->size = s->object_size = object_size; s->align = align; s->ctor = ctor; +#ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; +#endif err = __kmem_cache_create(s, flags); if (err) @@ -317,7 +321,8 @@ kmem_cache_create_usercopy(const char *name, flags &= CACHE_CREATE_MASK; /* Fail closed on bad usersize of useroffset values. 
*/ - if (WARN_ON(!usersize && useroffset) || + if (!IS_ENABLED(CONFIG_HARDENED_USERCOPY) || + WARN_ON(!usersize && useroffset) || WARN_ON(size < usersize || size - usersize < useroffset)) usersize = useroffset = 0; @@ -595,8 +600,8 @@ void kmem_dump_obj(void *object) ptroffset = ((char *)object - (char *)kp.kp_objp) - kp.kp_data_offset; pr_cont(" pointer offset %lu", ptroffset); } - if (kp.kp_slab_cache && kp.kp_slab_cache->usersize) - pr_cont(" size %u", kp.kp_slab_cache->usersize); + if (kp.kp_slab_cache && kp.kp_slab_cache->object_size) + pr_cont(" size %u", kp.kp_slab_cache->object_size); if (kp.kp_ret) pr_cont(" allocated at %pS\n", kp.kp_ret); else @@ -640,8 +645,10 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, align = max(align, size); s->align = calculate_alignment(flags, align, size); +#ifdef CONFIG_HARDENED_USERCOPY s->useroffset = useroffset; s->usersize = usersize; +#endif err = __kmem_cache_create(s, flags); @@ -766,10 +773,16 @@ EXPORT_SYMBOL(kmalloc_size_roundup); #define KMALLOC_CGROUP_NAME(sz) #endif +#ifndef CONFIG_SLUB_TINY +#define KMALLOC_RCL_NAME(sz) .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #sz, +#else +#define KMALLOC_RCL_NAME(sz) +#endif + #define INIT_KMALLOC_INFO(__size, __short_size) \ { \ .name[KMALLOC_NORMAL] = "kmalloc-" #__short_size, \ - .name[KMALLOC_RECLAIM] = "kmalloc-rcl-" #__short_size, \ + KMALLOC_RCL_NAME(__short_size) \ KMALLOC_CGROUP_NAME(__short_size) \ KMALLOC_DMA_NAME(__short_size) \ .size = __size, \ @@ -855,7 +868,7 @@ void __init setup_kmalloc_cache_index_table(void) static void __init new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) { - if (type == KMALLOC_RECLAIM) { + if ((KMALLOC_RECLAIM != KMALLOC_NORMAL) && (type == KMALLOC_RECLAIM)) { flags |= SLAB_RECLAIM_ACCOUNT; } else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) { if (mem_cgroup_kmem_disabled()) { @@ -941,7 +954,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned 
long caller if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = __kmalloc_large_node(size, flags, node); - trace_kmalloc(_RET_IP_, ret, size, + trace_kmalloc(caller, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; } @@ -953,7 +966,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller ret = __kmem_cache_alloc_node(s, flags, node, size, caller); ret = kasan_kmalloc(s, ret, size, flags); - trace_kmalloc(_RET_IP_, ret, size, s->size, flags, node); + trace_kmalloc(caller, ret, size, s->size, flags, node); return ret; } @@ -1010,7 +1023,7 @@ EXPORT_SYMBOL(kfree); /** * __ksize -- Report full size of underlying allocation - * @objp: pointer to the object + * @object: pointer to the object * * This should only be used internally to query the true size of allocations. * It is not meant to be a way to discover the usable size of an allocation @@ -1018,7 +1031,7 @@ EXPORT_SYMBOL(kfree); * the originally requested allocation size may trigger KASAN, UBSAN_BOUNDS, * and/or FORTIFY_SOURCE. 
* - * Return: size of the actual memory used by @objp in bytes + * Return: size of the actual memory used by @object in bytes */ size_t __ksize(const void *object) { @@ -1037,10 +1050,13 @@ size_t __ksize(const void *object) return folio_size(folio); } +#ifdef CONFIG_SLUB_DEBUG + skip_orig_size_check(folio_slab(folio)->slab_cache, object); +#endif + return slab_ksize(folio_slab(folio)->slab_cache); } -#ifdef CONFIG_TRACING void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, @@ -1064,7 +1080,6 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, return ret; } EXPORT_SYMBOL(kmalloc_node_trace); -#endif /* !CONFIG_TRACING */ #endif /* !CONFIG_SLOB */ gfp_t kmalloc_fix_flags(gfp_t flags) @@ -1333,11 +1348,11 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) void *ret; size_t ks; - /* Don't use instrumented ksize to allow precise KASAN poisoning. */ + /* Check for double-free before calling ksize. */ if (likely(!ZERO_OR_NULL_PTR(p))) { if (!kasan_check_byte(p)) return NULL; - ks = kfence_ksize(p) ?: __ksize(p); + ks = ksize(p); } else ks = 0; @@ -1405,35 +1420,21 @@ void kfree_sensitive(const void *p) void *mem = (void *)p; ks = ksize(mem); - if (ks) + if (ks) { + kasan_unpoison_range(mem, ks); memzero_explicit(mem, ks); + } kfree(mem); } EXPORT_SYMBOL(kfree_sensitive); -/** - * ksize - get the actual amount of memory allocated for a given object - * @objp: Pointer to the object - * - * kmalloc may internally round up allocations and return more memory - * than requested. ksize() can be used to determine the actual amount of - * memory allocated. The caller may use this additional memory, even though - * a smaller amount of memory was initially specified with the kmalloc call. - * The caller must guarantee that objp points to a valid object previously - * allocated with either kmalloc() or kmem_cache_alloc(). 
The object - * must not be freed during the duration of the call. - * - * Return: size of the actual memory used by @objp in bytes - */ size_t ksize(const void *objp) { - size_t size; - /* - * We need to first check that the pointer to the object is valid, and - * only then unpoison the memory. The report printed from ksize() is - * more useful, then when it's printed later when the behaviour could - * be undefined due to a potential use-after-free or double-free. + * We need to first check that the pointer to the object is valid. + * The KASAN report printed from ksize() is more useful, then when + * it's printed later when the behaviour could be undefined due to + * a potential use-after-free or double-free. * * We use kasan_check_byte(), which is supported for the hardware * tag-based KASAN mode, unlike kasan_check_read/write(). @@ -1447,13 +1448,7 @@ size_t ksize(const void *objp) if (unlikely(ZERO_OR_NULL_PTR(objp)) || !kasan_check_byte(objp)) return 0; - size = kfence_ksize(objp) ?: __ksize(objp); - /* - * We assume that ksize callers could use whole allocated area, - * so we need to unpoison this area. 
- */ - kasan_unpoison_range(objp, size); - return size; + return kfence_ksize(objp) ?: __ksize(objp); } EXPORT_SYMBOL(ksize); diff --git a/mm/slub.c b/mm/slub.c index 157527d7101b..13459c69095a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -39,6 +39,7 @@ #include <linux/memcontrol.h> #include <linux/random.h> #include <kunit/test.h> +#include <kunit/test-bug.h> #include <linux/sort.h> #include <linux/debugfs.h> @@ -187,6 +188,12 @@ do { \ #define USE_LOCKLESS_FAST_PATH() (false) #endif +#ifndef CONFIG_SLUB_TINY +#define __fastpath_inline __always_inline +#else +#define __fastpath_inline +#endif + #ifdef CONFIG_SLUB_DEBUG #ifdef CONFIG_SLUB_DEBUG_ON DEFINE_STATIC_KEY_TRUE(slub_debug_enabled); @@ -241,6 +248,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Enable to log cmpxchg failures */ #undef SLUB_DEBUG_CMPXCHG +#ifndef CONFIG_SLUB_TINY /* * Minimum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -253,6 +261,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 +#else +#define MIN_PARTIAL 0 +#define MAX_PARTIAL 0 +#endif #define DEBUG_DEFAULT_FLAGS (SLAB_CONSISTENCY_CHECKS | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) @@ -298,7 +310,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); #else @@ -332,10 +344,12 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) */ static nodemask_t slab_nodes; +#ifndef CONFIG_SLUB_TINY /* * Workqueue used for flush_cpu_slab(). 
*/ static struct workqueue_struct *flushwq; +#endif /******************************************************************** * Core slab cache functions @@ -381,10 +395,12 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return freelist_dereference(s, object + s->offset); } +#ifndef CONFIG_SLUB_TINY static void prefetch_freepointer(const struct kmem_cache *s, void *object) { prefetchw(object + s->offset); } +#endif /* * When running under KMSAN, get_freepointer_safe() may return an uninitialized @@ -603,7 +619,7 @@ static bool slab_add_kunit_errors(void) { struct kunit_resource *resource; - if (likely(!current->kunit_test)) + if (!kunit_get_current_test()) return false; resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); @@ -829,6 +845,17 @@ static inline void set_orig_size(struct kmem_cache *s, if (!slub_debug_orig_size(s)) return; +#ifdef CONFIG_KASAN_GENERIC + /* + * KASAN could save its free meta data in object's data area at + * offset 0, if the size is larger than 'orig_size', it will + * overlap the data redzone in [orig_size+1, object_size], and + * the check should be skipped. + */ + if (kasan_metadata_size(s, true) > orig_size) + orig_size = s->object_size; +#endif + p += get_info_end(s); p += sizeof(struct track) * 2; @@ -848,6 +875,11 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object) return *(unsigned int *)p; } +void skip_orig_size_check(struct kmem_cache *s, const void *object) +{ + set_orig_size(s, (void *)object, s->object_size); +} + static void slab_bug(struct kmem_cache *s, char *fmt, ...) 
{ struct va_format vaf; @@ -910,7 +942,7 @@ static void print_trailer(struct kmem_cache *s, struct slab *slab, u8 *p) if (slub_debug_orig_size(s)) off += sizeof(unsigned int); - off += kasan_metadata_size(s); + off += kasan_metadata_size(s, false); if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ @@ -966,17 +998,28 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct slab *slab, static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = kasan_reset_tag(object); + unsigned int poison_size = s->object_size; - if (s->flags & SLAB_RED_ZONE) + if (s->flags & SLAB_RED_ZONE) { memset(p - s->red_left_pad, val, s->red_left_pad); + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + /* + * Redzone the extra allocated space by kmalloc than + * requested, and the poison size will be limited to + * the original request size accordingly. + */ + poison_size = get_orig_size(s, object); + } + } + if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->object_size - 1); - p[s->object_size - 1] = POISON_END; + memset(p, POISON_FREE, poison_size - 1); + p[poison_size - 1] = POISON_END; } if (s->flags & SLAB_RED_ZONE) - memset(p + s->object_size, val, s->inuse - s->object_size); + memset(p + poison_size, val, s->inuse - poison_size); } static void restore_bytes(struct kmem_cache *s, char *message, u8 data, @@ -1070,7 +1113,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) off += sizeof(unsigned int); } - off += kasan_metadata_size(s); + off += kasan_metadata_size(s, false); if (size_from_object(s) == off) return 1; @@ -1120,6 +1163,7 @@ static int check_object(struct kmem_cache *s, struct slab *slab, { u8 *p = object; u8 *endobject = object + s->object_size; + unsigned int orig_size; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", @@ -1129,6 +1173,17 @@ static int check_object(struct kmem_cache *s, struct slab *slab, if 
(!check_bytes_and_report(s, slab, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) return 0; + + if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { + orig_size = get_orig_size(s, object); + + if (s->object_size > orig_size && + !check_bytes_and_report(s, slab, object, + "kmalloc Redzone", p + orig_size, + val, s->object_size - orig_size)) { + return 0; + } + } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, slab, p, "Alignment padding", @@ -1363,7 +1418,7 @@ static inline int alloc_consistency_checks(struct kmem_cache *s, return 1; } -static noinline int alloc_debug_processing(struct kmem_cache *s, +static noinline bool alloc_debug_processing(struct kmem_cache *s, struct slab *slab, void *object, int orig_size) { if (s->flags & SLAB_CONSISTENCY_CHECKS) { @@ -1375,7 +1430,7 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, trace(s, slab, object, 1); set_orig_size(s, object, orig_size); init_object(s, object, SLUB_RED_ACTIVE); - return 1; + return true; bad: if (folio_test_slab(slab_folio(slab))) { @@ -1388,7 +1443,7 @@ bad: slab->inuse = slab->objects; slab->freelist = NULL; } - return 0; + return false; } static inline int free_consistency_checks(struct kmem_cache *s, @@ -1641,17 +1696,17 @@ static inline void setup_object_debug(struct kmem_cache *s, void *object) {} static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} -static inline int alloc_debug_processing(struct kmem_cache *s, - struct slab *slab, void *object, int orig_size) { return 0; } +static inline bool alloc_debug_processing(struct kmem_cache *s, + struct slab *slab, void *object, int orig_size) { return true; } -static inline void free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) {} +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int 
*bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { return true; } static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } +static inline depot_stack_handle_t set_track_prepare(void) { return 0; } static inline void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) {} static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1676,11 +1731,13 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#ifndef CONFIG_SLUB_TINY static bool freelist_corrupted(struct kmem_cache *s, struct slab *slab, void **freelist, void *nextfree) { return false; } +#endif #endif /* CONFIG_SLUB_DEBUG */ /* @@ -1800,6 +1857,8 @@ static inline struct slab *alloc_slab_page(gfp_t flags, int node, slab = folio_slab(folio); __folio_set_slab(folio); + /* Make the flag visible before any changes to folio->mapping */ + smp_wmb(); if (page_is_pfmemalloc(folio_page(folio, 0))) slab_set_pfmemalloc(slab); @@ -1881,7 +1940,7 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) return false; freelist_count = oo_objects(s->oo); - pos = prandom_u32_max(freelist_count); + pos = get_random_u32_below(freelist_count); page_limit = slab->objects * s->size; start = fixup_red_left(s, slab_address(slab)); @@ -1999,17 +2058,11 @@ static void __free_slab(struct kmem_cache *s, struct slab *slab) int order = folio_order(folio); int pages = 1 << order; - if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { - void *p; - - slab_pad_check(s, slab); - for_each_object(p, s, slab_address(slab), slab->objects) - check_object(s, slab, p, SLUB_RED_INACTIVE); - } - __slab_clear_pfmemalloc(slab); - __folio_clear_slab(folio); folio->mapping = NULL; + /* Make the mapping reset visible before clearing the flag */ 
+ smp_wmb(); + __folio_clear_slab(folio); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; unaccount_slab(slab, order, s); @@ -2025,9 +2078,17 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct slab *slab) { - if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { + if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) { + void *p; + + slab_pad_check(s, slab); + for_each_object(p, s, slab_address(slab), slab->objects) + check_object(s, slab, p, SLUB_RED_INACTIVE); + } + + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&slab->rcu_head, rcu_free_slab); - } else + else __free_slab(s, slab); } @@ -2214,7 +2275,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!pfmemalloc_match(slab, pc->flags)) continue; - if (kmem_cache_debug(s)) { + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { object = alloc_single_from_partial(s, n, slab, pc->orig_size); if (object) @@ -2329,6 +2390,8 @@ static void *get_partial(struct kmem_cache *s, int node, struct partial_context return get_any_partial(s, pc); } +#ifndef CONFIG_SLUB_TINY + #ifdef CONFIG_PREEMPTION /* * Calculate the next globally unique transaction for disambiguation @@ -2342,7 +2405,7 @@ static void *get_partial(struct kmem_cache *s, int node, struct partial_context * different cpus. 
*/ #define TID_STEP 1 -#endif +#endif /* CONFIG_PREEMPTION */ static inline unsigned long next_tid(unsigned long tid) { @@ -2411,7 +2474,7 @@ static void init_kmem_cache_cpus(struct kmem_cache *s) static void deactivate_slab(struct kmem_cache *s, struct slab *slab, void *freelist) { - enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE, M_FULL_NOLIST }; + enum slab_modes { M_NONE, M_PARTIAL, M_FREE, M_FULL_NOLIST }; struct kmem_cache_node *n = get_node(s, slab_nid(slab)); int free_delta = 0; enum slab_modes mode = M_NONE; @@ -2487,14 +2550,6 @@ redo: * acquire_slab() will see a slab that is frozen */ spin_lock_irqsave(&n->list_lock, flags); - } else if (kmem_cache_debug_flags(s, SLAB_STORE_USER)) { - mode = M_FULL; - /* - * This also ensures that the scanning of full - * slabs from diagnostic functions will not see - * any frozen slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); } else { mode = M_FULL_NOLIST; } @@ -2504,7 +2559,7 @@ redo: old.freelist, old.counters, new.freelist, new.counters, "unfreezing slab")) { - if (mode == M_PARTIAL || mode == M_FULL) + if (mode == M_PARTIAL) spin_unlock_irqrestore(&n->list_lock, flags); goto redo; } @@ -2518,10 +2573,6 @@ redo: stat(s, DEACTIVATE_EMPTY); discard_slab(s, slab); stat(s, FREE_SLAB); - } else if (mode == M_FULL) { - add_full(s, n, slab); - spin_unlock_irqrestore(&n->list_lock, flags); - stat(s, DEACTIVATE_FULL); } else if (mode == M_FULL_NOLIST) { stat(s, DEACTIVATE_FULL); } @@ -2803,6 +2854,13 @@ static int slub_cpu_dead(unsigned int cpu) return 0; } +#else /* CONFIG_SLUB_TINY */ +static inline void flush_all_cpus_locked(struct kmem_cache *s) { } +static inline void flush_all(struct kmem_cache *s) { } +static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { } +static inline int slub_cpu_dead(unsigned int cpu) { return 0; } +#endif /* CONFIG_SLUB_TINY */ + /* * Check if the objects in a per cpu structure fit numa * locality expectations. 
@@ -2828,38 +2886,28 @@ static inline unsigned long node_nr_objs(struct kmem_cache_node *n) } /* Supports checking bulk free of a constructed freelist */ -static noinline void free_debug_processing( - struct kmem_cache *s, struct slab *slab, - void *head, void *tail, int bulk_cnt, - unsigned long addr) +static inline bool free_debug_processing(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, int *bulk_cnt, + unsigned long addr, depot_stack_handle_t handle) { - struct kmem_cache_node *n = get_node(s, slab_nid(slab)); - struct slab *slab_free = NULL; + bool checks_ok = false; void *object = head; int cnt = 0; - unsigned long flags; - bool checks_ok = false; - depot_stack_handle_t handle = 0; - - if (s->flags & SLAB_STORE_USER) - handle = set_track_prepare(); - - spin_lock_irqsave(&n->list_lock, flags); if (s->flags & SLAB_CONSISTENCY_CHECKS) { if (!check_slab(s, slab)) goto out; } - if (slab->inuse < bulk_cnt) { + if (slab->inuse < *bulk_cnt) { slab_err(s, slab, "Slab has %d allocated objects but %d are to be freed\n", - slab->inuse, bulk_cnt); + slab->inuse, *bulk_cnt); goto out; } next_object: - if (++cnt > bulk_cnt) + if (++cnt > *bulk_cnt) goto out_cnt; if (s->flags & SLAB_CONSISTENCY_CHECKS) { @@ -2881,61 +2929,22 @@ next_object: checks_ok = true; out_cnt: - if (cnt != bulk_cnt) + if (cnt != *bulk_cnt) { slab_err(s, slab, "Bulk free expected %d objects but found %d\n", - bulk_cnt, cnt); - -out: - if (checks_ok) { - void *prior = slab->freelist; - - /* Perform the actual freeing while we still hold the locks */ - slab->inuse -= cnt; - set_freepointer(s, tail, prior); - slab->freelist = head; - - /* - * If the slab is empty, and node's partial list is full, - * it should be discarded anyway no matter it's on full or - * partial list. 
- */ - if (slab->inuse == 0 && n->nr_partial >= s->min_partial) - slab_free = slab; - - if (!prior) { - /* was on full list */ - remove_full(s, n, slab); - if (!slab_free) { - add_partial(n, slab, DEACTIVATE_TO_TAIL); - stat(s, FREE_ADD_PARTIAL); - } - } else if (slab_free) { - remove_partial(n, slab); - stat(s, FREE_REMOVE_PARTIAL); - } - } - - if (slab_free) { - /* - * Update the counters while still holding n->list_lock to - * prevent spurious validation warnings - */ - dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + *bulk_cnt, cnt); + *bulk_cnt = cnt; } - spin_unlock_irqrestore(&n->list_lock, flags); +out: if (!checks_ok) slab_fix(s, "Object at 0x%p not freed", object); - if (slab_free) { - stat(s, FREE_SLAB); - free_slab(s, slab_free); - } + return checks_ok; } #endif /* CONFIG_SLUB_DEBUG */ -#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) +#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct slab *)) { @@ -2949,12 +2958,12 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } -#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ +#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ +#ifdef CONFIG_SLUB_DEBUG static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { -#ifdef CONFIG_SLUB_DEBUG static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; @@ -2985,8 +2994,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } -#endif } +#else /* CONFIG_SLUB_DEBUG */ +static inline void +slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { } +#endif static inline bool pfmemalloc_match(struct slab *slab, gfp_t gfpflags) { @@ -2996,6 +3008,7 @@ static inline bool pfmemalloc_match(struct slab 
*slab, gfp_t gfpflags) return true; } +#ifndef CONFIG_SLUB_TINY /* * Check the slab->freelist and either transfer the freelist to the * per cpu freelist or deactivate the slab. @@ -3283,45 +3296,13 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, return p; } -/* - * If the object has been wiped upon free, make sure it's fully initialized by - * zeroing out freelist pointer. - */ -static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, - void *obj) -{ - if (unlikely(slab_want_init_on_free(s)) && obj) - memset((void *)((char *)kasan_reset_tag(obj) + s->offset), - 0, sizeof(void *)); -} - -/* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. - * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. - * - * Otherwise we can simply pick the next object from the lockless free list. 
- */ -static __always_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, +static __always_inline void *__slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) { - void *object; struct kmem_cache_cpu *c; struct slab *slab; unsigned long tid; - struct obj_cgroup *objcg = NULL; - bool init = false; - - s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); - if (!s) - return NULL; - - object = kfence_alloc(s, orig_size, gfpflags); - if (unlikely(object)) - goto out; + void *object; redo: /* @@ -3391,22 +3372,95 @@ redo: stat(s, ALLOC_FASTPATH); } + return object; +} +#else /* CONFIG_SLUB_TINY */ +static void *__slab_alloc_node(struct kmem_cache *s, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + struct partial_context pc; + struct slab *slab; + void *object; + + pc.flags = gfpflags; + pc.slab = &slab; + pc.orig_size = orig_size; + object = get_partial(s, node, &pc); + + if (object) + return object; + + slab = new_slab(s, gfpflags, node); + if (unlikely(!slab)) { + slab_out_of_memory(s, gfpflags, node); + return NULL; + } + + object = alloc_single_from_new_slab(s, slab, orig_size); + + return object; +} +#endif /* CONFIG_SLUB_TINY */ + +/* + * If the object has been wiped upon free, make sure it's fully initialized by + * zeroing out freelist pointer. + */ +static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s, + void *obj) +{ + if (unlikely(slab_want_init_on_free(s)) && obj) + memset((void *)((char *)kasan_reset_tag(obj) + s->offset), + 0, sizeof(void *)); +} + +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. 
+ * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static __fastpath_inline void *slab_alloc_node(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) +{ + void *object; + struct obj_cgroup *objcg = NULL; + bool init = false; + + s = slab_pre_alloc_hook(s, lru, &objcg, 1, gfpflags); + if (!s) + return NULL; + + object = kfence_alloc(s, orig_size, gfpflags); + if (unlikely(object)) + goto out; + + object = __slab_alloc_node(s, gfpflags, node, addr, orig_size); + maybe_wipe_obj_freeptr(s, object); init = slab_want_init_on_alloc(gfpflags, s); out: - slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init); + /* + * When init equals 'true', like for kzalloc() family, only + * @orig_size bytes might be zeroed instead of s->object_size + */ + slab_post_alloc_hook(s, objcg, gfpflags, 1, &object, init, orig_size); return object; } -static __always_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, +static __fastpath_inline void *slab_alloc(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags, unsigned long addr, size_t orig_size) { return slab_alloc_node(s, lru, gfpflags, NUMA_NO_NODE, addr, orig_size); } -static __always_inline +static __fastpath_inline void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { @@ -3448,6 +3502,67 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) } EXPORT_SYMBOL(kmem_cache_alloc_node); +static noinline void free_to_partial_list( + struct kmem_cache *s, struct slab *slab, + void *head, void *tail, int bulk_cnt, + unsigned long addr) +{ + struct kmem_cache_node *n = get_node(s, slab_nid(slab)); + struct slab *slab_free = NULL; + int cnt = bulk_cnt; + unsigned long flags; + depot_stack_handle_t handle = 0; + + if (s->flags & SLAB_STORE_USER) + handle = set_track_prepare(); + + spin_lock_irqsave(&n->list_lock, flags); + + if (free_debug_processing(s, slab, head, 
tail, &cnt, addr, handle)) { + void *prior = slab->freelist; + + /* Perform the actual freeing while we still hold the locks */ + slab->inuse -= cnt; + set_freepointer(s, tail, prior); + slab->freelist = head; + + /* + * If the slab is empty, and node's partial list is full, + * it should be discarded anyway no matter it's on full or + * partial list. + */ + if (slab->inuse == 0 && n->nr_partial >= s->min_partial) + slab_free = slab; + + if (!prior) { + /* was on full list */ + remove_full(s, n, slab); + if (!slab_free) { + add_partial(n, slab, DEACTIVATE_TO_TAIL); + stat(s, FREE_ADD_PARTIAL); + } + } else if (slab_free) { + remove_partial(n, slab); + stat(s, FREE_REMOVE_PARTIAL); + } + } + + if (slab_free) { + /* + * Update the counters while still holding n->list_lock to + * prevent spurious validation warnings + */ + dec_slabs_node(s, slab_nid(slab_free), slab_free->objects); + } + + spin_unlock_irqrestore(&n->list_lock, flags); + + if (slab_free) { + stat(s, FREE_SLAB); + free_slab(s, slab_free); + } +} + /* * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. @@ -3473,8 +3588,8 @@ static void __slab_free(struct kmem_cache *s, struct slab *slab, if (kfence_free(head)) return; - if (kmem_cache_debug(s)) { - free_debug_processing(s, slab, head, tail, cnt, addr); + if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) { + free_to_partial_list(s, slab, head, tail, cnt, addr); return; } @@ -3574,6 +3689,7 @@ slab_empty: discard_slab(s, slab); } +#ifndef CONFIG_SLUB_TINY /* * Fastpath with forced inlining to produce a kfree and kmem_cache_free that * can perform fastpath freeing without additional function calls. @@ -3648,8 +3764,18 @@ redo: } stat(s, FREE_FASTPATH); } +#else /* CONFIG_SLUB_TINY */ +static void do_slab_free(struct kmem_cache *s, + struct slab *slab, void *head, void *tail, + int cnt, unsigned long addr) +{ + void *tail_obj = tail ? 
: head; + + __slab_free(s, slab, head, tail_obj, cnt, addr); +} +#endif /* CONFIG_SLUB_TINY */ -static __always_inline void slab_free(struct kmem_cache *s, struct slab *slab, +static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, void *head, void *tail, void **p, int cnt, unsigned long addr) { @@ -3782,18 +3908,13 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) } EXPORT_SYMBOL(kmem_cache_free_bulk); -/* Note that interrupts must be enabled when calling this function. */ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +#ifndef CONFIG_SLUB_TINY +static inline int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, struct obj_cgroup *objcg) { struct kmem_cache_cpu *c; int i; - struct obj_cgroup *objcg = NULL; - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); - if (unlikely(!s)) - return false; /* * Drain objects in the per cpu slab, while disabling local * IRQs, which protects against PREEMPT and interrupts @@ -3847,18 +3968,71 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, local_unlock_irq(&s->cpu_slab->lock); slub_put_cpu_ptr(s->cpu_slab); - /* - * memcg and kmem_cache debug support and memory initialization. - * Done outside of the IRQ disabled fastpath loop. 
- */ - slab_post_alloc_hook(s, objcg, flags, size, p, - slab_want_init_on_alloc(flags, s)); return i; + error: slub_put_cpu_ptr(s->cpu_slab); - slab_post_alloc_hook(s, objcg, flags, i, p, false); + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); kmem_cache_free_bulk(s, i, p); return 0; + +} +#else /* CONFIG_SLUB_TINY */ +static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + size_t size, void **p, struct obj_cgroup *objcg) +{ + int i; + + for (i = 0; i < size; i++) { + void *object = kfence_alloc(s, s->object_size, flags); + + if (unlikely(object)) { + p[i] = object; + continue; + } + + p[i] = __slab_alloc_node(s, flags, NUMA_NO_NODE, + _RET_IP_, s->object_size); + if (unlikely(!p[i])) + goto error; + + maybe_wipe_obj_freeptr(s, p[i]); + } + + return i; + +error: + slab_post_alloc_hook(s, objcg, flags, i, p, false, s->object_size); + kmem_cache_free_bulk(s, i, p); + return 0; +} +#endif /* CONFIG_SLUB_TINY */ + +/* Note that interrupts must be enabled when calling this function. */ +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + int i; + struct obj_cgroup *objcg = NULL; + + if (!size) + return 0; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, NULL, &objcg, size, flags); + if (unlikely(!s)) + return 0; + + i = __kmem_cache_alloc_bulk(s, flags, size, p, objcg); + + /* + * memcg and kmem_cache debug support and memory initialization. + * Done outside of the IRQ disabled fastpath loop. + */ + if (i != 0) + slab_post_alloc_hook(s, objcg, flags, size, p, + slab_want_init_on_alloc(flags, s), s->object_size); + return i; } EXPORT_SYMBOL(kmem_cache_alloc_bulk); @@ -3883,7 +4057,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * take the list_lock. */ static unsigned int slub_min_order; -static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +static unsigned int slub_max_order = + IS_ENABLED(CONFIG_SLUB_TINY) ? 
1 : PAGE_ALLOC_COSTLY_ORDER; static unsigned int slub_min_objects; /* @@ -4014,10 +4189,12 @@ init_kmem_cache_node(struct kmem_cache_node *n) #endif } +#ifndef CONFIG_SLUB_TINY static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); + NR_KMALLOC_TYPES * KMALLOC_SHIFT_HIGH * + sizeof(struct kmem_cache_cpu)); /* * Must align to double word boundary for the double cmpxchg @@ -4033,6 +4210,12 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) return 1; } +#else +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) +{ + return 1; +} +#endif /* CONFIG_SLUB_TINY */ static struct kmem_cache *kmem_cache_node; @@ -4095,7 +4278,9 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) void __kmem_cache_release(struct kmem_cache *s) { cache_random_seq_destroy(s); +#ifndef CONFIG_SLUB_TINY free_percpu(s->cpu_slab); +#endif free_kmem_cache_nodes(s); } @@ -4202,7 +4387,8 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || + if (slub_debug_orig_size(s) || + (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || s->ctor) { /* @@ -4771,11 +4957,6 @@ static int slab_memory_callback(struct notifier_block *self, return ret; } -static struct notifier_block slab_memory_callback_nb = { - .notifier_call = slab_memory_callback, - .priority = SLAB_CALLBACK_PRI, -}; - /******************************************************************** * Basic setup of slabs *******************************************************************/ @@ -4841,7 +5022,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache_node, "kmem_cache_node", sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); - register_hotmemory_notifier(&slab_memory_callback_nb); + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); /* Able to 
allocate the per node structures */ slab_state = PARTIAL; @@ -4872,8 +5053,10 @@ void __init kmem_cache_init(void) void __init kmem_cache_init_late(void) { +#ifndef CONFIG_SLUB_TINY flushwq = alloc_workqueue("slub_flushwq", WQ_MEM_RECLAIM, 0); WARN_ON(!flushwq); +#endif } struct kmem_cache * @@ -4924,7 +5107,7 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) return 0; } -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int count_inuse(struct slab *slab) { return slab->inuse; @@ -5182,7 +5365,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, #endif /* CONFIG_DEBUG_FS */ #endif /* CONFIG_SLUB_DEBUG */ -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -5502,11 +5685,13 @@ static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) SLAB_ATTR_RO(cache_dma); #endif +#ifdef CONFIG_HARDENED_USERCOPY static ssize_t usersize_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%u\n", s->usersize); } SLAB_ATTR_RO(usersize); +#endif static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { @@ -5586,7 +5771,21 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf) { return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); } -SLAB_ATTR_RO(failslab); + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + if (s->refcount > 1) + return -EINVAL; + + if (buf[0] == '1') + WRITE_ONCE(s->flags, s->flags | SLAB_FAILSLAB); + else + WRITE_ONCE(s->flags, s->flags & ~SLAB_FAILSLAB); + + return length; +} +SLAB_ATTR(failslab); #endif static ssize_t shrink_show(struct kmem_cache *s, char *buf) @@ -5803,7 +6002,9 @@ static struct attribute *slab_attrs[] = { #ifdef CONFIG_FAILSLAB &failslab_attr.attr, #endif +#ifdef CONFIG_HARDENED_USERCOPY &usersize_attr.attr, +#endif #ifdef CONFIG_KFENCE &skip_kfence_attr.attr, #endif @@ -5920,11 +6121,6 @@ static int 
sysfs_slab_add(struct kmem_cache *s) struct kset *kset = cache_kset(s); int unmergeable = slab_unmergeable(s); - if (!kset) { - kobject_init(&s->kobj, &slab_ktype); - return 0; - } - if (!unmergeable && disable_higher_order_debug && (slub_debug & DEBUG_METADATA_FLAGS)) unmergeable = 1; @@ -6054,9 +6250,8 @@ static int __init slab_sysfs_init(void) mutex_unlock(&slab_mutex); return 0; } - -__initcall(slab_sysfs_init); -#endif /* CONFIG_SYSFS */ +late_initcall(slab_sysfs_init); +#endif /* SLAB_SUPPORTS_SYSFS */ #if defined(CONFIG_SLUB_DEBUG) && defined(CONFIG_DEBUG_FS) static int slab_debugfs_show(struct seq_file *seq, void *v) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 46ae542118c0..c5398a5960d0 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -196,6 +196,10 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } +void __weak __meminit pmd_init(void *addr) +{ +} + pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); @@ -203,11 +207,16 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + pmd_init(p); pud_populate(&init_mm, pud, p); } return pud; } +void __weak __meminit pud_init(void *addr) +{ +} + p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); @@ -215,6 +224,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; + pud_init(p); p4d_populate(&init_mm, p4d, p); } return p4d; @@ -285,6 +295,69 @@ int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end, return vmemmap_populate_range(start, end, node, altmap, NULL); } +void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node, + unsigned long addr, unsigned long next) +{ +} 
+ +int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node, + unsigned long addr, unsigned long next) +{ + return 0; +} + +int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = vmemmap_pgd_populate(addr, node); + if (!pgd) + return -ENOMEM; + + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + continue; + } else if (altmap) { + /* + * No fallback: In any case we care about, the + * altmap should be reasonably sized and aligned + * such that vmemmap_alloc_block_buf() will always + * succeed. For consistency with the PTE case, + * return an error here as failure could indicate + * a configuration issue with the size of the altmap. + */ + return -ENOMEM; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) + continue; + if (vmemmap_populate_basepages(addr, next, node, altmap)) + return -ENOMEM; + } + return 0; +} + /* * For compound pages bigger than section size (e.g. 
x86 1G compound * pages with 2M subsection size) fill the rest of sections as tail diff --git a/mm/sparse.c b/mm/sparse.c index e5a8a3a0edd7..2779b419ef2a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -926,8 +926,6 @@ void sparse_remove_section(struct mem_section *ms, unsigned long pfn, unsigned long nr_pages, unsigned long map_offset, struct vmem_altmap *altmap) { - clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset, - nr_pages - map_offset); section_deactivate(pfn, nr_pages, altmap); } #endif /* CONFIG_MEMORY_HOTPLUG */ diff --git a/mm/swap.c b/mm/swap.c index 955930f41d20..70e2063ef43a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -43,8 +43,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/pagemap.h> -/* How many pages do we try to swap or page in/out together? */ +/* How many pages do we try to swap or page in/out together? As a power of 2 */ int page_cluster; +const int page_cluster_max = 31; /* Protecting only lru_rotate.fbatch which requires disabling interrupts */ struct lru_rotate { @@ -295,8 +296,20 @@ void folio_rotate_reclaimable(struct folio *folio) } } -void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) +void lru_note_cost(struct lruvec *lruvec, bool file, + unsigned int nr_io, unsigned int nr_rotated) { + unsigned long cost; + + /* + * Reflect the relative cost of incurring IO and spending CPU + * time on rotations. This doesn't attempt to make a precise + * comparison, it just says: if reloads are about comparable + * between the LRU lists, or rotations are overwhelmingly + * different between them, adjust scan balance for CPU work. 
+ */ + cost = nr_io * SWAP_CLUSTER_MAX + nr_rotated; + do { unsigned long lrusize; @@ -310,9 +323,9 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) spin_lock_irq(&lruvec->lru_lock); /* Record cost event */ if (file) - lruvec->file_cost += nr_pages; + lruvec->file_cost += cost; else - lruvec->anon_cost += nr_pages; + lruvec->anon_cost += cost; /* * Decay previous events @@ -335,10 +348,10 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages) } while ((lruvec = parent_lruvec(lruvec))); } -void lru_note_cost_folio(struct folio *folio) +void lru_note_cost_refault(struct folio *folio) { lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), - folio_nr_pages(folio)); + folio_nr_pages(folio), 0); } static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio) @@ -968,22 +981,30 @@ void lru_cache_disable(void) /** * release_pages - batched put_page() - * @pages: array of pages to release + * @arg: array of pages to release * @nr: number of pages * - * Decrement the reference count on all the pages in @pages. If it + * Decrement the reference count on all the pages in @arg. If it * fell to zero, remove the page from the LRU and free it. + * + * Note that the argument can be an array of pages, encoded pages, + * or folio pointers. We ignore any encoded bits, and turn any of + * them into just a folio that gets free'd. 
*/ -void release_pages(struct page **pages, int nr) +void release_pages(release_pages_arg arg, int nr) { int i; + struct encoded_page **encoded = arg.encoded_pages; LIST_HEAD(pages_to_free); struct lruvec *lruvec = NULL; unsigned long flags = 0; unsigned int lock_batch; for (i = 0; i < nr; i++) { - struct folio *folio = page_folio(pages[i]); + struct folio *folio; + + /* Turn any of the argument types into a folio */ + folio = page_folio(encoded_page_ptr(encoded[i])); /* * Make sure the IRQ-safe lock-holding time does not get diff --git a/mm/swap.h b/mm/swap.h index cc08c459c619..f78065c8ef52 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,7 +41,8 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index); struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, @@ -105,9 +106,10 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry, } static inline -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index) { - return find_get_page(mapping, index); + return filemap_get_folio(mapping, index); } static inline bool add_to_swap(struct folio *folio) diff --git a/mm/swap_state.c b/mm/swap_state.c index 438d0676c5be..2927507b43d8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -303,15 +303,12 @@ void free_page_and_swap_cache(struct page *page) * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. 
*/ -void free_pages_and_swap_cache(struct page **pages, int nr) +void free_pages_and_swap_cache(struct encoded_page **pages, int nr) { - struct page **pagep = pages; - int i; - lru_add_drain(); - for (i = 0; i < nr; i++) - free_swap_cache(pagep[i]); - release_pages(pagep, nr); + for (int i = 0; i < nr; i++) + free_swap_cache(encoded_page_ptr(pages[i])); + release_pages(pages, nr); } static inline bool swap_use_vma_readahead(void) @@ -373,30 +370,28 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, } /** - * find_get_incore_page - Find and get a page from the page or swap caches. + * filemap_get_incore_folio - Find and get a folio from the page or swap caches. * @mapping: The address_space to search. * @index: The page cache index. * - * This differs from find_get_page() in that it will also look for the - * page in the swap cache. + * This differs from filemap_get_folio() in that it will also look for the + * folio in the swap cache. * - * Return: The found page or %NULL. + * Return: The found folio or %NULL. */ -struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) +struct folio *filemap_get_incore_folio(struct address_space *mapping, + pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; - struct page *page = pagecache_get_page(mapping, index, - FGP_ENTRY | FGP_HEAD, 0); + struct folio *folio = __filemap_get_folio(mapping, index, FGP_ENTRY, 0); - if (!page) - return page; - if (!xa_is_value(page)) - return find_subpage(page, index); + if (!xa_is_value(folio)) + goto out; if (!shmem_mapping(mapping)) return NULL; - swp = radix_to_swp_entry(page); + swp = radix_to_swp_entry(folio); /* There might be swapin error entries in shmem mapping. 
*/ if (non_swap_entry(swp)) return NULL; @@ -404,9 +399,11 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) si = get_swap_device(swp); if (!si) return NULL; - page = find_get_page(swap_address_space(swp), swp_offset(swp)); + index = swp_offset(swp); + folio = filemap_get_folio(swap_address_space(swp), index); put_swap_device(si); - return page; +out: + return folio; } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, diff --git a/mm/swapfile.c b/mm/swapfile.c index 5fc1237a9f21..908a529bca12 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -772,8 +772,7 @@ static void set_cluster_next(struct swap_info_struct *si, unsigned long next) /* No free swap slots available */ if (si->highest_bit <= si->lowest_bit) return; - next = si->lowest_bit + - prandom_u32_max(si->highest_bit - si->lowest_bit + 1); + next = get_random_u32_inclusive(si->lowest_bit, si->highest_bit); next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES); next = max_t(unsigned int, next, si->lowest_bit); } @@ -973,23 +972,23 @@ done: scan: spin_unlock(&si->lock); while (++offset <= READ_ONCE(si->highest_bit)) { - if (swap_offset_available_and_locked(si, offset)) - goto checks; if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } + if (swap_offset_available_and_locked(si, offset)) + goto checks; } offset = si->lowest_bit; while (offset < scan_base) { - if (swap_offset_available_and_locked(si, offset)) - goto checks; if (unlikely(--latency_ration < 0)) { cond_resched(); latency_ration = LATENCY_LIMIT; scanned_many = true; } + if (swap_offset_available_and_locked(si, offset)) + goto checks; offset++; } spin_lock(&si->lock); @@ -1781,7 +1780,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte_t pteval; dec_mm_counter(vma->vm_mm, MM_SWAPENTS); - pteval = swp_entry_to_pte(make_swapin_error_entry(page)); + pteval = swp_entry_to_pte(make_swapin_error_entry()); set_pte_at(vma->vm_mm, addr, 
pte, pteval); swap_free(entry); ret = 0; @@ -3089,7 +3088,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) */ for_each_possible_cpu(cpu) { per_cpu(*p->cluster_next_cpu, cpu) = - 1 + prandom_u32_max(p->highest_bit); + get_random_u32_inclusive(1, p->highest_bit); } nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); diff --git a/mm/truncate.c b/mm/truncate.c index c0be77e5c008..7b4ea4c4a46b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -361,9 +361,8 @@ void truncate_inode_pages_range(struct address_space *mapping, folio_batch_init(&fbatch); index = start; - while (index < end && find_lock_entries(mapping, index, end - 1, + while (index < end && find_lock_entries(mapping, &index, end - 1, &fbatch, indices)) { - index = indices[folio_batch_count(&fbatch) - 1] + 1; truncate_folio_batch_exceptionals(mapping, &fbatch, indices); for (i = 0; i < folio_batch_count(&fbatch); i++) truncate_cleanup_folio(fbatch.folios[i]); @@ -401,7 +400,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = start; while (index < end) { cond_resched(); - if (!find_get_entries(mapping, index, end - 1, &fbatch, + if (!find_get_entries(mapping, &index, end - 1, &fbatch, indices)) { /* If all gone from start onwards, we're done */ if (index == start) @@ -415,21 +414,18 @@ void truncate_inode_pages_range(struct address_space *mapping, struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing page->index */ - index = indices[i]; if (xa_is_value(folio)) continue; folio_lock(folio); - VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); folio_wait_writeback(folio); truncate_inode_folio(mapping, folio); folio_unlock(folio); - index = folio_index(folio) + folio_nr_pages(folio) - 1; } truncate_folio_batch_exceptionals(mapping, &fbatch, indices); folio_batch_release(&fbatch); - index++; } } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -510,20 +506,17 @@ unsigned 
long invalidate_mapping_pagevec(struct address_space *mapping, int i; folio_batch_init(&fbatch); - while (find_lock_entries(mapping, index, end, &fbatch, indices)) { + while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ - index = indices[i]; if (xa_is_value(folio)) { count += invalidate_exceptional_entry(mapping, - index, - folio); + indices[i], folio); continue; } - index += folio_nr_pages(folio) - 1; ret = mapping_evict_folio(mapping, folio); folio_unlock(folio); @@ -542,7 +535,6 @@ unsigned long invalidate_mapping_pagevec(struct address_space *mapping, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } return count; } @@ -573,7 +565,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave pages behind because * shrink_page_list() has a temp ref on them, or because they're transiently - * sitting in the lru_cache_add() pagevecs. + * sitting in the folio_add_lru() pagevecs. 
*/ static int invalidate_complete_folio2(struct address_space *mapping, struct folio *folio) @@ -641,16 +633,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping, folio_batch_init(&fbatch); index = start; - while (find_get_entries(mapping, index, end, &fbatch, indices)) { + while (find_get_entries(mapping, &index, end, &fbatch, indices)) { for (i = 0; i < folio_batch_count(&fbatch); i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ - index = indices[i]; if (xa_is_value(folio)) { if (!invalidate_exceptional_entry2(mapping, - index, folio)) + indices[i], folio)) ret = -EBUSY; continue; } @@ -660,13 +651,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * If folio is mapped, before taking its lock, * zap the rest of the file in one hit. */ - unmap_mapping_pages(mapping, index, - (1 + end - index), false); + unmap_mapping_pages(mapping, indices[i], + (1 + end - indices[i]), false); did_range_unmap = 1; } folio_lock(folio); - VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio); + VM_BUG_ON_FOLIO(!folio_contains(folio, indices[i]), folio); if (folio->mapping != mapping) { folio_unlock(folio); continue; @@ -689,7 +680,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); cond_resched(); - index++; } /* * For DAX we invalidate page tables after invalidating page cache. 
We diff --git a/mm/usercopy.c b/mm/usercopy.c index c1ee15a98633..4c3164beacec 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/kstrtox.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/task.h> @@ -258,7 +259,7 @@ static bool enable_checks __initdata = true; static int __init parse_hardened_usercopy(char *str) { - if (strtobool(str, &enable_checks)) + if (kstrtobool(str, &enable_checks)) pr_warn("Invalid option string for hardened_usercopy: '%s'\n", str); return 1; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e24e8a47ce8a..0499907b6f1a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -64,8 +64,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, pte_t _dst_pte, *dst_pte; bool writable = dst_vma->vm_flags & VM_WRITE; bool vm_shared = dst_vma->vm_flags & VM_SHARED; - bool page_in_cache = page->mapping; + bool page_in_cache = page_mapping(page); spinlock_t *ptl; + struct folio *folio; struct inode *inode; pgoff_t offset, max_off; @@ -113,14 +114,15 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, if (!pte_none_mostly(*dst_pte)) goto out_unlock; + folio = page_folio(page); if (page_in_cache) { /* Usually, cache pages are already added to LRU */ if (newly_allocated) - lru_cache_add(page); + folio_add_lru(folio); page_add_file_rmap(page, dst_vma, false); } else { page_add_new_anon_rmap(page, dst_vma, dst_addr); - lru_cache_add_inactive_or_unevictable(page, dst_vma); + folio_add_lru_vma(folio, dst_vma); } /* @@ -157,11 +159,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. 
For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -613,7 +632,7 @@ retry: break; } - dst_pmdval = pmd_read_atomic(dst_pmd); + dst_pmdval = pmdp_get_lockless(dst_pmd); /* * If the dst_pmd is mapped as THP don't * override it and just be strict. @@ -646,11 +665,11 @@ retry: mmap_read_unlock(dst_mm); BUG_ON(!page); - page_kaddr = kmap(page); + page_kaddr = kmap_local_page(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap(page); + kunmap_local(page_kaddr); if (unlikely(err)) { err = -EFAULT; goto out; diff --git a/mm/util.c b/mm/util.c index 12984e76767e..b56c92fb910f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,32 +717,6 @@ void *page_rmapping(struct page *page) return folio_raw_mapping(page_folio(page)); } -/** - * folio_mapped - Is this folio mapped into userspace? - * @folio: The folio. - * - * Return: True if any page in this folio is referenced by user page tables. 
- */ -bool folio_mapped(struct folio *folio) -{ - long i, nr; - - if (!folio_test_large(folio)) - return atomic_read(&folio->_mapcount) >= 0; - if (atomic_read(folio_mapcount_ptr(folio)) >= 0) - return true; - if (folio_test_hugetlb(folio)) - return false; - - nr = folio_nr_pages(folio); - for (i = 0; i < nr; i++) { - if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0) - return true; - } - return false; -} -EXPORT_SYMBOL(folio_mapped); - struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; @@ -783,59 +757,6 @@ struct address_space *folio_mapping(struct folio *folio) } EXPORT_SYMBOL(folio_mapping); -/* Slow path of page_mapcount() for compound pages */ -int __page_mapcount(struct page *page) -{ - int ret; - - ret = atomic_read(&page->_mapcount) + 1; - /* - * For file THP page->_mapcount contains total number of mapping - * of the page: no need to look into compound_mapcount. - */ - if (!PageAnon(page) && !PageHuge(page)) - return ret; - page = compound_head(page); - ret += atomic_read(compound_mapcount_ptr(page)) + 1; - if (PageDoubleMap(page)) - ret--; - return ret; -} -EXPORT_SYMBOL_GPL(__page_mapcount); - -/** - * folio_mapcount() - Calculate the number of mappings of this folio. - * @folio: The folio. - * - * A large folio tracks both how many times the entire folio is mapped, - * and how many times each individual page in the folio is mapped. - * This function calculates the total number of times the folio is - * mapped. - * - * Return: The number of times this folio is mapped. 
- */ -int folio_mapcount(struct folio *folio) -{ - int i, compound, nr, ret; - - if (likely(!folio_test_large(folio))) - return atomic_read(&folio->_mapcount) + 1; - - compound = folio_entire_mapcount(folio); - if (folio_test_hugetlb(folio)) - return compound; - ret = compound; - nr = folio_nr_pages(folio); - for (i = 0; i < nr; i++) - ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; - /* File pages has compound_mapcount included in _mapcount */ - if (!folio_test_anon(folio)) - return ret - compound * nr; - if (folio_test_double_map(folio)) - ret -= nr; - return ret; -} - /** * folio_copy - Copy the contents of one folio to another. * @dst: Folio to copy to. diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ccaa461998f3..ca71de7c9d77 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -43,6 +43,9 @@ #include <asm/tlbflush.h> #include <asm/shmparam.h> +#define CREATE_TRACE_POINTS +#include <trace/events/vmalloc.h> + #include "internal.h" #include "pgalloc-track.h" @@ -1620,6 +1623,8 @@ retry: size, align, vstart, vend); spin_unlock(&free_vmap_area_lock); + trace_alloc_vmap_area(addr, size, align, vstart, vend, addr == vend); + /* * If an allocation fails, the "vend" address is * returned. Therefore trigger the overflow path. 
@@ -1725,6 +1730,7 @@ static void purge_fragmented_blocks_allcpus(void); static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { unsigned long resched_threshold; + unsigned int num_purged_areas = 0; struct list_head local_purge_list; struct vmap_area *va, *n_va; @@ -1736,7 +1742,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) spin_unlock(&purge_vmap_area_lock); if (unlikely(list_empty(&local_purge_list))) - return false; + goto out; start = min(start, list_first_entry(&local_purge_list, @@ -1771,12 +1777,16 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) va->va_start, va->va_end); atomic_long_sub(nr, &vmap_lazy_nr); + num_purged_areas++; if (atomic_long_read(&vmap_lazy_nr) < resched_threshold) cond_resched_lock(&free_vmap_area_lock); } spin_unlock(&free_vmap_area_lock); - return true; + +out: + trace_purge_vmap_area_lazy(start, end, num_purged_areas); + return num_purged_areas > 0; } /* @@ -1811,6 +1821,8 @@ static void drain_vmap_area_work(struct work_struct *work) */ static void free_vmap_area_noflush(struct vmap_area *va) { + unsigned long nr_lazy_max = lazy_max_pages(); + unsigned long va_start = va->va_start; unsigned long nr_lazy; spin_lock(&vmap_area_lock); @@ -1828,8 +1840,10 @@ static void free_vmap_area_noflush(struct vmap_area *va) &purge_vmap_area_root, &purge_vmap_area_list); spin_unlock(&purge_vmap_area_lock); + trace_free_vmap_area_noflush(va_start, nr_lazy, nr_lazy_max); + /* After this point, we may free va at any time */ - if (unlikely(nr_lazy > lazy_max_pages())) + if (unlikely(nr_lazy > nr_lazy_max)) schedule_work(&drain_vmap_work); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 04d8b88e5216..bd6637fcd8f9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -54,6 +54,7 @@ #include <linux/shmem_fs.h> #include <linux/ctype.h> #include <linux/debugfs.h> +#include <linux/khugepaged.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -1020,31 +1021,52 @@ out: return freed; 
} -static void drop_slab_node(int nid) +static unsigned long drop_slab_node(int nid) { - unsigned long freed; - int shift = 0; + unsigned long freed = 0; + struct mem_cgroup *memcg = NULL; + memcg = mem_cgroup_iter(NULL, NULL, NULL); do { - struct mem_cgroup *memcg = NULL; + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - if (fatal_signal_pending(current)) - return; + return freed; +} +void drop_slab(void) +{ + int nid; + int shift = 0; + unsigned long freed; + + do { freed = 0; - memcg = mem_cgroup_iter(NULL, NULL, NULL); - do { - freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); - } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + for_each_online_node(nid) { + if (fatal_signal_pending(current)) + return; + + freed += drop_slab_node(nid); + } } while ((freed >> shift++) > 1); } -void drop_slab(void) +static int reclaimer_offset(void) { - int nid; + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD); + BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD != + PGSCAN_DIRECT - PGSCAN_KSWAPD); + BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != + PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD); + BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD != + PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD); - for_each_online_node(nid) - drop_slab_node(nid); + if (current_is_kswapd()) + return 0; + if (current_is_khugepaged()) + return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD; + return PGSTEAL_DIRECT - PGSTEAL_KSWAPD; } static inline int is_page_cache_freeable(struct folio *folio) @@ -1346,11 +1368,10 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (folio_test_swapcache(folio)) { swp_entry_t swap = folio_swap_entry(folio); - /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */ if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); - mem_cgroup_swapout(folio, swap); __delete_from_swap_cache(folio, 
swap, shadow); + mem_cgroup_swapout(folio, swap); xa_unlock_irq(&mapping->i_pages); put_swap_folio(folio, swap); } else { @@ -1599,10 +1620,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); - if (current_is_kswapd()) - __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded); - else - __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded); + __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded); return nr_succeeded; } @@ -2069,10 +2087,29 @@ keep: nr_reclaimed += demote_folio_list(&demote_folios, pgdat); /* Folios that could not be demoted are still in @demote_folios */ if (!list_empty(&demote_folios)) { - /* Folios which weren't demoted go back on @folio_list for retry: */ + /* Folios which weren't demoted go back on @folio_list */ list_splice_init(&demote_folios, folio_list); - do_demote_pass = false; - goto retry; + + /* + * goto retry to reclaim the undemoted folios in folio_list if + * desired. + * + * Reclaiming directly from top tier nodes is not often desired + * due to it breaking the LRU ordering: in general memory + * should be reclaimed from lower tier nodes and demoted from + * top tier nodes. + * + * However, disabling reclaim from top tier nodes entirely + * would cause ooms in edge scenarios where lower tier memory + * is unreclaimable for whatever reason, eg memory being + * mlocked or too hot to reclaim. We can disable reclaim + * from top tier nodes in proactive reclaim though as that is + * not real memory pressure. + */ + if (!sc->proactive) { + do_demote_pass = false; + goto retry; + } } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; @@ -2475,7 +2512,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); - item = current_is_kswapd() ? 
PGSCAN_KSWAPD : PGSCAN_DIRECT; + item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_scanned); __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned); @@ -2492,14 +2529,14 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, move_folios_to_lru(lruvec, &folio_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); - item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, nr_reclaimed); __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed); __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); - lru_note_cost(lruvec, file, stat.nr_pageout); + lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); mem_cgroup_uncharge_list(&folio_list); free_unref_page_list(&folio_list); @@ -2514,8 +2551,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. */ - if (stat.nr_unqueued_dirty == nr_taken) + if (stat.nr_unqueued_dirty == nr_taken) { wakeup_flusher_threads(WB_REASON_VMSCAN); + /* + * For cgroupv1 dirty throttling is achieved by waking up + * the kernel flusher here and later waiting on folios + * which are in writeback to finish (see shrink_folio_list()). + * + * Flusher may not be able to issue writeback quickly + * enough for cgroupv1 writeback throttling to work + * on a large system. 
+ */ + if (!writeback_throttling_sane(sc)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } sc->nr.dirty += stat.nr_dirty; sc->nr.congested += stat.nr_congested; @@ -2639,6 +2688,8 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&lruvec->lru_lock); + if (nr_rotated) + lru_note_cost(lruvec, file, 0, nr_rotated); mem_cgroup_uncharge_list(&l_active); free_unref_page_list(&l_active); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, @@ -3133,7 +3184,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) if (memcg) { struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec; - /* for hotadd_new_pgdat() */ + /* see the comment in mem_cgroup_lruvec() */ if (!lruvec->pgdat) lruvec->pgdat = pgdat; @@ -3142,7 +3193,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid) #endif VM_WARN_ON_ONCE(!mem_cgroup_disabled()); - return pgdat ? &pgdat->__lruvec : NULL; + return &pgdat->__lruvec; } static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) @@ -3206,9 +3257,6 @@ void lru_gen_add_mm(struct mm_struct *mm) for_each_node_state(nid, N_MEMORY) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - /* the first addition since the last iteration */ if (lruvec->mm_state.tail == &mm_list->fifo) lruvec->mm_state.tail = &mm->lru_gen.list; @@ -3238,9 +3286,6 @@ void lru_gen_del_mm(struct mm_struct *mm) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - /* where the last iteration ended (exclusive) */ if (lruvec->mm_state.tail == &mm->lru_gen.list) lruvec->mm_state.tail = lruvec->mm_state.tail->next; @@ -3975,7 +4020,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area goto next; if (!pmd_trans_huge(pmd[i])) { - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && + if (arch_has_hw_nonleaf_pmd_young() && 
get_cap(LRU_GEN_NONLEAF_YOUNG)) pmdp_test_and_clear_young(vma, addr, pmd + i); goto next; @@ -4039,10 +4084,7 @@ restart: /* walk_pte_range() may call get_next_vma() */ vma = args->vma; for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { - pmd_t val = pmd_read_atomic(pmd + i); - - /* for pmd_read_atomic() */ - barrier(); + pmd_t val = pmdp_get_lockless(pmd + i); next = pmd_addr_end(addr, end); @@ -4073,14 +4115,14 @@ restart: #endif walk->mm_stats[MM_NONLEAF_TOTAL]++; -#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG - if (get_cap(LRU_GEN_NONLEAF_YOUNG)) { + if (arch_has_hw_nonleaf_pmd_young() && + get_cap(LRU_GEN_NONLEAF_YOUNG)) { if (!pmd_young(val)) continue; walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos); } -#endif + if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i)) continue; @@ -4486,7 +4528,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned mem_cgroup_calculate_protection(NULL, memcg); - if (mem_cgroup_below_min(memcg)) + if (mem_cgroup_below_min(NULL, memcg)) return false; need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); @@ -4857,7 +4899,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, break; } - item = current_is_kswapd() ? 
PGSCAN_KSWAPD : PGSCAN_DIRECT; + item = PGSCAN_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) { __count_vm_events(item, isolated); __count_vm_events(PGREFILL, sorted); @@ -4971,10 +5013,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap int scanned; int reclaimed; LIST_HEAD(list); + LIST_HEAD(clean); struct folio *folio; + struct folio *next; enum vm_event_item item; struct reclaim_stat stat; struct lru_gen_mm_walk *walk; + bool skip_retry = false; struct mem_cgroup *memcg = lruvec_memcg(lruvec); struct pglist_data *pgdat = lruvec_pgdat(lruvec); @@ -4991,20 +5036,37 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (list_empty(&list)) return scanned; - +retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); + sc->nr_reclaimed += reclaimed; - list_for_each_entry(folio, &list, lru) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); + list_for_each_entry_safe_reverse(folio, next, &list, lru) { + if (!folio_evictable(folio)) { + list_del(&folio->lru); + folio_putback_lru(folio); + continue; + } - /* don't add rejected pages to the oldest generation */ if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) - folio_clear_active(folio); - else - folio_set_active(folio); + (folio_test_dirty(folio) || folio_test_writeback(folio))) { + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ + if (folio_test_workingset(folio)) + folio_set_referenced(folio); + continue; + } + + if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || + folio_mapped(folio) || folio_test_locked(folio) || + folio_test_dirty(folio) || folio_test_writeback(folio)) { + /* don't add rejected folios to the oldest generation */ + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, + BIT(PG_active)); + continue; + } + + /* retry folios that may have missed 
folio_rotate_reclaimable() */ + list_move(&folio->lru, &clean); + sc->nr_scanned -= folio_nr_pages(folio); } spin_lock_irq(&lruvec->lru_lock); @@ -5015,7 +5077,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (walk && walk->batched) reset_batch_size(lruvec, walk); - item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + item = PGSTEAL_KSWAPD + reclaimer_offset(); if (!cgroup_reclaim(sc)) __count_vm_events(item, reclaimed); __count_memcg_events(memcg, item, reclaimed); @@ -5026,7 +5088,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap mem_cgroup_uncharge_list(&list); free_unref_page_list(&list); - sc->nr_reclaimed += reclaimed; + INIT_LIST_HEAD(&list); + list_splice_init(&clean, &list); + + if (!list_empty(&list)) { + skip_retry = true; + goto retry; + } if (need_swapping && type == LRU_GEN_ANON) *need_swapping = true; @@ -5047,8 +5115,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control * DEFINE_MAX_SEQ(lruvec); DEFINE_MIN_SEQ(lruvec); - if (mem_cgroup_below_min(memcg) || - (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) || + (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) && + !sc->memcg_low_reclaim)) return 0; *need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); @@ -5289,9 +5358,6 @@ static void lru_gen_change_state(bool enabled) for_each_node(nid) { struct lruvec *lruvec = get_lruvec(memcg, nid); - if (!lruvec) - continue; - spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -5354,10 +5420,10 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK)) caps |= BIT(LRU_GEN_MM_WALK); - if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG)) + if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) caps 
|= BIT(LRU_GEN_NONLEAF_YOUNG); - return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps); + return sysfs_emit(buf, "0x%04x\n", caps); } /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ @@ -5844,8 +5910,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + bool proportional_reclaim; struct blk_plug plug; - bool scan_adjusted; if (lru_gen_enabled()) { lru_gen_shrink_lruvec(lruvec, sc); @@ -5868,8 +5934,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) * abort proportional reclaim if either the file or anon lru has already * dropped to zero at the first pass. */ - scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() && - sc->priority == DEF_PRIORITY); + proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -5889,7 +5955,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) cond_resched(); - if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + if (nr_reclaimed < nr_to_reclaim || proportional_reclaim) continue; /* @@ -5940,8 +6006,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_scanned = targets[lru] - nr[lru]; nr[lru] = targets[lru] * (100 - percentage) / 100; nr[lru] -= min(nr[lru], nr_scanned); - - scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; @@ -6048,13 +6112,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) mem_cgroup_calculate_protection(target_memcg, memcg); - if (mem_cgroup_below_min(memcg)) { + if (mem_cgroup_below_min(target_memcg, memcg)) { /* * Hard protection. * If there is no reclaimable memory, OOM. 
*/ continue; - } else if (mem_cgroup_below_low(memcg)) { + } else if (mem_cgroup_below_low(target_memcg, memcg)) { /* * Soft protection. * Respect the protection only as long as @@ -6690,7 +6754,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_pages, gfp_t gfp_mask, - unsigned int reclaim_options) + unsigned int reclaim_options, + nodemask_t *nodemask) { unsigned long nr_reclaimed; unsigned int noreclaim_flag; @@ -6705,6 +6770,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_unmap = 1, .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), + .nodemask = nodemask, }; /* * Traverse the ZONELIST_FALLBACK zonelist of the current node to put diff --git a/mm/vmstat.c b/mm/vmstat.c index b2371d745e00..1ea6a5ce1c41 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1271,10 +1271,13 @@ const char * const vmstat_text[] = { "pgreuse", "pgsteal_kswapd", "pgsteal_direct", + "pgsteal_khugepaged", "pgdemote_kswapd", "pgdemote_direct", + "pgdemote_khugepaged", "pgscan_kswapd", "pgscan_direct", + "pgscan_khugepaged", "pgscan_direct_throttle", "pgscan_anon", "pgscan_file", diff --git a/mm/workingset.c b/mm/workingset.c index ae7e984b23c6..1a86645b7b3c 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -492,8 +492,11 @@ void workingset_refault(struct folio *folio, void *shadow) /* Folio was active prior to eviction */ if (workingset) { folio_set_workingset(folio); - /* XXX: Move to lru_cache_add() when it supports new vs putback */ - lru_note_cost_folio(folio); + /* + * XXX: Move to folio_add_lru() when it supports new vs + * putback + */ + lru_note_cost_refault(folio); mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); } out: diff --git a/mm/z3fold.c b/mm/z3fold.c index cf71da10d04e..a4de0c317ac7 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -68,9 +68,6 @@ * Structures 
*****************/ struct z3fold_pool; -struct z3fold_ops { - int (*evict)(struct z3fold_pool *pool, unsigned long handle); -}; enum buddy { HEADLESS = 0, @@ -138,8 +135,6 @@ struct z3fold_header { * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @c_handle: cache for z3fold_buddy_slots allocation - * @ops: pointer to a structure of user defined operations specified at - * pool creation time. * @zpool: zpool driver * @zpool_ops: zpool operations structure with an evict callback * @compact_wq: workqueue for page layout background optimization @@ -158,7 +153,6 @@ struct z3fold_pool { struct list_head stale; atomic64_t pages_nr; struct kmem_cache *c_handle; - const struct z3fold_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; @@ -907,13 +901,11 @@ out_fail: * z3fold_create_pool() - create a new z3fold pool * @name: pool name * @gfp: gfp flags when allocating the z3fold pool structure - * @ops: user-defined operations for the z3fold pool * * Return: pointer to the new z3fold pool or NULL if the metadata allocation * failed. 
*/ -static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, - const struct z3fold_ops *ops) +static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp) { struct z3fold_pool *pool = NULL; int i, cpu; @@ -949,7 +941,6 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, if (!pool->release_wq) goto out_wq; INIT_WORK(&pool->work, free_pages_work); - pool->ops = ops; return pool; out_wq: @@ -1230,10 +1221,6 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE); spin_lock(&pool->lock); - if (!pool->ops || !pool->ops->evict || retries == 0) { - spin_unlock(&pool->lock); - return -EINVAL; - } for (i = 0; i < retries; i++) { if (list_empty(&pool->lru)) { spin_unlock(&pool->lock); @@ -1319,17 +1306,17 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) } /* Issue the eviction callback(s) */ if (middle_handle) { - ret = pool->ops->evict(pool, middle_handle); + ret = pool->zpool_ops->evict(pool->zpool, middle_handle); if (ret) goto next; } if (first_handle) { - ret = pool->ops->evict(pool, first_handle); + ret = pool->zpool_ops->evict(pool->zpool, first_handle); if (ret) goto next; } if (last_handle) { - ret = pool->ops->evict(pool, last_handle); + ret = pool->zpool_ops->evict(pool->zpool, last_handle); if (ret) goto next; } @@ -1593,26 +1580,13 @@ static const struct movable_operations z3fold_mops = { * zpool ****************/ -static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle) -{ - if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) - return pool->zpool_ops->evict(pool->zpool, handle); - else - return -ENOENT; -} - -static const struct z3fold_ops z3fold_zpool_ops = { - .evict = z3fold_zpool_evict -}; - static void *z3fold_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { struct z3fold_pool *pool; - pool = 
z3fold_create_pool(name, gfp, - zpool_ops ? &z3fold_zpool_ops : NULL); + pool = z3fold_create_pool(name, gfp); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; diff --git a/mm/zbud.c b/mm/zbud.c index 6348932430b8..3acd26193920 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -74,10 +74,6 @@ struct zbud_pool; -struct zbud_ops { - int (*evict)(struct zbud_pool *pool, unsigned long handle); -}; - /** * struct zbud_pool - stores metadata for each zbud pool * @lock: protects all pool fields and first|last_chunk fields of any @@ -90,8 +86,6 @@ struct zbud_ops { * @lru: list tracking the zbud pages in LRU order by most recently * added buddy. * @pages_nr: number of zbud pages in the pool. - * @ops: pointer to a structure of user defined operations specified at - * pool creation time. * @zpool: zpool driver * @zpool_ops: zpool operations structure with an evict callback * @@ -110,7 +104,6 @@ struct zbud_pool { }; struct list_head lru; u64 pages_nr; - const struct zbud_ops *ops; struct zpool *zpool; const struct zpool_ops *zpool_ops; }; @@ -212,12 +205,11 @@ static int num_free_chunks(struct zbud_header *zhdr) /** * zbud_create_pool() - create a new zbud pool * @gfp: gfp flags when allocating the zbud pool structure - * @ops: user-defined operations for the zbud pool * * Return: pointer to the new zbud pool or NULL if the metadata allocation * failed. 
*/ -static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) +static struct zbud_pool *zbud_create_pool(gfp_t gfp) { struct zbud_pool *pool; int i; @@ -231,7 +223,6 @@ static struct zbud_pool *zbud_create_pool(gfp_t gfp, const struct zbud_ops *ops) INIT_LIST_HEAD(&pool->buddied); INIT_LIST_HEAD(&pool->lru); pool->pages_nr = 0; - pool->ops = ops; return pool; } @@ -419,8 +410,7 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) unsigned long first_handle = 0, last_handle = 0; spin_lock(&pool->lock); - if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || - retries == 0) { + if (list_empty(&pool->lru)) { spin_unlock(&pool->lock); return -EINVAL; } @@ -444,12 +434,12 @@ static int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) /* Issue the eviction callback(s) */ if (first_handle) { - ret = pool->ops->evict(pool, first_handle); + ret = pool->zpool_ops->evict(pool->zpool, first_handle); if (ret) goto next; } if (last_handle) { - ret = pool->ops->evict(pool, last_handle); + ret = pool->zpool_ops->evict(pool->zpool, last_handle); if (ret) goto next; } @@ -524,25 +514,13 @@ static u64 zbud_get_pool_size(struct zbud_pool *pool) * zpool ****************/ -static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) -{ - if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) - return pool->zpool_ops->evict(pool->zpool, handle); - else - return -ENOENT; -} - -static const struct zbud_ops zbud_zpool_ops = { - .evict = zbud_zpool_evict -}; - static void *zbud_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { struct zbud_pool *pool; - pool = zbud_create_pool(gfp, zpool_ops ? 
&zbud_zpool_ops : NULL); + pool = zbud_create_pool(gfp); if (pool) { pool->zpool = zpool; pool->zpool_ops = zpool_ops; diff --git a/mm/zpool.c b/mm/zpool.c index 68facc193496..571f5c5031dd 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -21,9 +21,6 @@ struct zpool { struct zpool_driver *driver; void *pool; - const struct zpool_ops *ops; - bool evictable; - bool can_sleep_mapped; }; static LIST_HEAD(drivers_head); @@ -177,9 +174,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); - zpool->ops = ops; - zpool->evictable = driver->shrink && ops && ops->evict; - zpool->can_sleep_mapped = driver->sleep_mapped; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -380,18 +374,25 @@ u64 zpool_get_total_size(struct zpool *zpool) */ bool zpool_evictable(struct zpool *zpool) { - return zpool->evictable; + return zpool->driver->shrink; } /** * zpool_can_sleep_mapped - Test if zpool can sleep when do mapped. * @zpool: The zpool to test * + * Some allocators enter non-preemptible context in ->map() callback (e.g. + * disable pagefaults) and exit that context in ->unmap(), which limits what + * we can do with the mapped object. For instance, we cannot wait for + * asynchronous crypto API to decompress such an object or take mutexes + * since those will call into the scheduler. This function tells us whether + * we use such an allocator. + * * Returns: true if zpool can sleep; false otherwise. 
*/ bool zpool_can_sleep_mapped(struct zpool *zpool) { - return zpool->can_sleep_mapped; + return zpool->driver->sleep_mapped; } MODULE_LICENSE("GPL"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 525758713a55..9445bee6b014 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -33,8 +33,7 @@ /* * lock ordering: * page_lock - * pool->migrate_lock - * class->lock + * pool->lock * zspage->lock */ @@ -192,7 +191,6 @@ static const int fullness_threshold_frac = 4; static size_t huge_class_size; struct size_class { - spinlock_t lock; struct list_head fullness_list[NR_ZS_FULLNESS]; /* * Size of objects stored in this class. Must be multiple @@ -241,14 +239,20 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; +#ifdef CONFIG_ZPOOL + /* List tracking the zspages in LRU order by most recently added object */ + struct list_head lru; + struct zpool *zpool; + const struct zpool_ops *zpool_ops; +#endif + #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif #ifdef CONFIG_COMPACTION struct work_struct free_work; #endif - /* protect page/zspage migration */ - rwlock_t migrate_lock; + spinlock_t lock; }; struct zspage { @@ -263,10 +267,17 @@ struct zspage { unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ + +#ifdef CONFIG_ZPOOL + /* links the zspage to the lru list in the pool */ + struct list_head lru; + bool under_reclaim; + /* list of unfreed handles whose objects have been reclaimed */ + unsigned long *deferred_handles; +#endif + struct zs_pool *pool; -#ifdef CONFIG_COMPACTION rwlock_t lock; -#endif }; struct mapping_area { @@ -287,10 +298,11 @@ static bool ZsHugePage(struct zspage *zspage) return zspage->huge; } -#ifdef CONFIG_COMPACTION static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); + +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage); static void 
migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); @@ -298,9 +310,6 @@ static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else -static void migrate_lock_init(struct zspage *zspage) {} -static void migrate_read_lock(struct zspage *zspage) {} -static void migrate_read_unlock(struct zspage *zspage) {} static void migrate_write_lock(struct zspage *zspage) {} static void migrate_write_lock_nested(struct zspage *zspage) {} static void migrate_write_unlock(struct zspage *zspage) {} @@ -355,7 +364,7 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) kmem_cache_free(pool->zspage_cachep, zspage); } -/* class->lock(which owns the handle) synchronizes races */ +/* pool->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { *(unsigned long *)handle = obj; @@ -374,7 +383,14 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, * different contexts and its caller must provide a valid * gfp mask. 
*/ - return zs_create_pool(name); + struct zs_pool *pool = zs_create_pool(name); + + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + + return pool; } static void zs_zpool_destroy(void *pool) @@ -387,7 +403,7 @@ static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, { *handle = zs_malloc(pool, size, gfp); - if (IS_ERR((void *)(*handle))) + if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); return 0; } @@ -396,6 +412,27 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries); + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zs_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -434,6 +471,7 @@ static struct zpool_driver zs_zpool_driver = { .malloc_support_movable = true, .malloc = zs_zpool_malloc, .free = zs_zpool_free, + .shrink = zs_zpool_shrink, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, @@ -452,7 +490,7 @@ static __maybe_unused int is_first_page(struct page *page) return PagePrivate(page); } -/* Protected by class->lock */ +/* Protected by pool->lock */ static inline int get_zspage_inuse(struct zspage *zspage) { return zspage->inuse; @@ -597,13 +635,13 @@ static int zs_stats_size_show(struct seq_file *s, void *v) if (class->index != i) continue; - spin_lock(&class->lock); + spin_lock(&pool->lock); class_almost_full = zs_stat_get(class, CLASS_ALMOST_FULL); class_almost_empty = zs_stat_get(class, CLASS_ALMOST_EMPTY); obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); obj_used = zs_stat_get(class, OBJ_USED); freeable = zs_can_compact(class); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); 
objs_per_zspage = class->objs_per_zspage; pages_used = obj_allocated / objs_per_zspage * @@ -907,6 +945,25 @@ unlock: return 0; } +#ifdef CONFIG_ZPOOL +/* + * Free all the deferred handles whose objects are freed in zs_free. + */ +static void free_handles(struct zs_pool *pool, struct zspage *zspage) +{ + unsigned long handle = (unsigned long)zspage->deferred_handles; + + while (handle) { + unsigned long nxt_handle = handle_to_obj(handle); + + cache_free_handle(pool, handle); + handle = nxt_handle; + } +} +#else +static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} +#endif + static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { @@ -916,11 +973,14 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, get_zspage_mapping(zspage, &class_idx, &fg); - assert_spin_locked(&class->lock); + assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(fg != ZS_EMPTY); + /* Free all deferred handles from zs_free */ + free_handles(pool, zspage); + next = page = get_first_page(zspage); do { VM_BUG_ON_PAGE(!PageLocked(page), page); @@ -956,6 +1016,9 @@ static void free_zspage(struct zs_pool *pool, struct size_class *class, } remove_zspage(class, zspage, ZS_EMPTY); +#ifdef CONFIG_ZPOOL + list_del(&zspage->lru); +#endif __free_zspage(pool, class, zspage); } @@ -1001,6 +1064,12 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) off %= PAGE_SIZE; } +#ifdef CONFIG_ZPOOL + INIT_LIST_HEAD(&zspage->lru); + zspage->under_reclaim = false; + zspage->deferred_handles = NULL; +#endif + set_freeobj(zspage, 0); } @@ -1205,6 +1274,27 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage) return get_zspage_inuse(zspage) == class->objs_per_zspage; } +/** + * zs_lookup_class_index() - Returns index of the zsmalloc &size_class + * that hold objects of the provided size. 
+ * @pool: zsmalloc pool to use + * @size: object size + * + * Context: Any context. + * + * Return: the index of the zsmalloc &size_class that hold objects of the + * provided size. + */ +unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size) +{ + struct size_class *class; + + class = pool->size_class[get_size_class_index(size)]; + + return class->index; +} +EXPORT_SYMBOL_GPL(zs_lookup_class_index); + unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); @@ -1247,19 +1337,44 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, BUG_ON(in_interrupt()); /* It guarantees it can get zspage from handle safely */ - read_lock(&pool->migrate_lock); + spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); +#ifdef CONFIG_ZPOOL + /* + * Move the zspage to front of pool's LRU. + * + * Note that this is swap-specific, so by definition there are no ongoing + * accesses to the memory while the page is swapped out that would make + * it "hot". A new entry is hot, then ages to the tail until it gets either + * written back or swaps back in. + * + * Furthermore, map is also called during writeback. We must not put an + * isolated page on the LRU mid-reclaim. + * + * As a result, only update the LRU when the page is mapped for write + * when it's first instantiated. + * + * This is a deviation from the other backends, which perform this update + * in the allocation function (zbud_alloc, z3fold_alloc). + */ + if (mm == ZS_MM_WO) { + if (!list_empty(&zspage->lru)) + list_del(&zspage->lru); + list_add(&zspage->lru, &pool->lru); + } +#endif + /* - * migration cannot move any zpages in this zspage. Here, class->lock + * migration cannot move any zpages in this zspage. 
Here, pool->lock * is too heavy since callers would take some time until they calls * zs_unmap_object API so delegate the locking from class to zspage * which is smaller granularity. */ migrate_read_lock(zspage); - read_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); class = zspage_class(pool, zspage); off = (class->size * obj_idx) & ~PAGE_MASK; @@ -1412,8 +1527,8 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; - /* class->lock effectively protects the zpage migration */ - spin_lock(&class->lock); + /* pool->lock effectively protects the zpage migration */ + spin_lock(&pool->lock); zspage = find_get_zspage(class); if (likely(zspage)) { obj = obj_malloc(pool, zspage, handle); @@ -1421,12 +1536,12 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) fix_fullness_group(class, zspage); record_obj(handle, obj); class_stat_inc(class, OBJ_USED, 1); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return handle; } - spin_unlock(&class->lock); + spin_unlock(&pool->lock); zspage = alloc_zspage(pool, class, gfp); if (!zspage) { @@ -1434,7 +1549,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) return (unsigned long)ERR_PTR(-ENOMEM); } - spin_lock(&class->lock); + spin_lock(&pool->lock); obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); @@ -1447,7 +1562,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); return handle; } @@ -1491,26 +1606,38 @@ void zs_free(struct zs_pool *pool, unsigned long handle) return; /* - * The pool->migrate_lock protects the race with zpage's migration + * The pool->lock protects the race with zpage's migration * so it's safe to get the page from handle. 
*/ - read_lock(&pool->migrate_lock); + spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_page(obj, &f_page); zspage = get_zspage(f_page); class = zspage_class(pool, zspage); - spin_lock(&class->lock); - read_unlock(&pool->migrate_lock); obj_free(class->size, obj); class_stat_dec(class, OBJ_USED, 1); + +#ifdef CONFIG_ZPOOL + if (zspage->under_reclaim) { + /* + * Reclaim needs the handles during writeback. It'll free + * them along with the zspage when it's done with them. + * + * Record current deferred handle at the memory location + * whose address is given by handle. + */ + record_obj(handle, (unsigned long)zspage->deferred_handles); + zspage->deferred_handles = (unsigned long *)handle; + spin_unlock(&pool->lock); + return; + } +#endif fullness = fix_fullness_group(class, zspage); - if (fullness != ZS_EMPTY) - goto out; + if (fullness == ZS_EMPTY) + free_zspage(pool, class, zspage); - free_zspage(pool, class, zspage); -out: - spin_unlock(&class->lock); + spin_unlock(&pool->lock); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); @@ -1709,7 +1836,7 @@ static enum fullness_group putback_zspage(struct size_class *class, return fullness; } -#ifdef CONFIG_COMPACTION +#if defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) /* * To prevent zspage destroy during migration, zspage freeing should * hold locks of all pages in the zspage. @@ -1751,6 +1878,24 @@ static void lock_zspage(struct zspage *zspage) } migrate_read_unlock(zspage); } +#endif /* defined(CONFIG_ZPOOL) || defined(CONFIG_COMPACTION) */ + +#ifdef CONFIG_ZPOOL +/* + * Unlocks all the pages of the zspage. + * + * pool->lock must be held before this function is called + * to prevent the underlying pages from migrating. 
+ */ +static void unlock_zspage(struct zspage *zspage) +{ + struct page *page = get_first_page(zspage); + + do { + unlock_page(page); + } while ((page = get_next_page(page)) != NULL); +} +#endif /* CONFIG_ZPOOL */ static void migrate_lock_init(struct zspage *zspage) { @@ -1767,6 +1912,7 @@ static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) read_unlock(&zspage->lock); } +#ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); @@ -1867,16 +2013,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page, pool = zspage->pool; /* - * The pool migrate_lock protects the race between zpage migration + * The pool's lock protects the race between zpage migration * and zs_free. */ - write_lock(&pool->migrate_lock); + spin_lock(&pool->lock); class = zspage_class(pool, zspage); - /* - * the class lock protects zpage alloc/free in the zspage. - */ - spin_lock(&class->lock); /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); @@ -1906,10 +2048,9 @@ static int zs_page_migrate(struct page *newpage, struct page *page, replace_sub_page(class, zspage, newpage, page); /* * Since we complete the data copy and set up new zspage structure, - * it's okay to release migration_lock. + * it's okay to release the pool's lock. 
*/ - write_unlock(&pool->migrate_lock); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); dec_zspage_isolation(zspage); migrate_write_unlock(zspage); @@ -1964,9 +2105,9 @@ static void async_free_zspage(struct work_struct *work) if (class->index != i) continue; - spin_lock(&class->lock); + spin_lock(&pool->lock); list_splice_init(&class->fullness_list[ZS_EMPTY], &free_pages); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } list_for_each_entry_safe(zspage, tmp, &free_pages, list) { @@ -1976,9 +2117,12 @@ static void async_free_zspage(struct work_struct *work) get_zspage_mapping(zspage, &class_idx, &fullness); VM_BUG_ON(fullness != ZS_EMPTY); class = pool->size_class[class_idx]; - spin_lock(&class->lock); + spin_lock(&pool->lock); +#ifdef CONFIG_ZPOOL + list_del(&zspage->lru); +#endif __free_zspage(pool, class, zspage); - spin_unlock(&class->lock); + spin_unlock(&pool->lock); } }; @@ -2039,10 +2183,11 @@ static unsigned long __zs_compact(struct zs_pool *pool, struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; - /* protect the race between zpage migration and zs_free */ - write_lock(&pool->migrate_lock); - /* protect zpage allocation/free */ - spin_lock(&class->lock); + /* + * protect the race between zpage migration and zs_free + * as well as zpage allocation/free + */ + spin_lock(&pool->lock); while ((src_zspage = isolate_zspage(class, true))) { /* protect someone accessing the zspage(i.e., zs_map_object) */ migrate_write_lock(src_zspage); @@ -2067,7 +2212,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, putback_zspage(class, dst_zspage); migrate_write_unlock(dst_zspage); dst_zspage = NULL; - if (rwlock_is_contended(&pool->migrate_lock)) + if (spin_is_contended(&pool->lock)) break; } @@ -2084,11 +2229,9 @@ static unsigned long __zs_compact(struct zs_pool *pool, pages_freed += class->pages_per_zspage; } else migrate_write_unlock(src_zspage); - spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + 
spin_unlock(&pool->lock); cond_resched(); - write_lock(&pool->migrate_lock); - spin_lock(&class->lock); + spin_lock(&pool->lock); } if (src_zspage) { @@ -2096,8 +2239,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, migrate_write_unlock(src_zspage); } - spin_unlock(&class->lock); - write_unlock(&pool->migrate_lock); + spin_unlock(&pool->lock); return pages_freed; } @@ -2200,7 +2342,7 @@ struct zs_pool *zs_create_pool(const char *name) return NULL; init_deferred_free(pool); - rwlock_init(&pool->migrate_lock); + spin_lock_init(&pool->lock); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) @@ -2271,7 +2413,6 @@ struct zs_pool *zs_create_pool(const char *name) class->index = i; class->pages_per_zspage = pages_per_zspage; class->objs_per_zspage = objs_per_zspage; - spin_lock_init(&class->lock); pool->size_class[i] = class; for (fullness = ZS_EMPTY; fullness < NR_ZS_FULLNESS; fullness++) @@ -2291,6 +2432,10 @@ struct zs_pool *zs_create_pool(const char *name) */ zs_register_shrinker(pool); +#ifdef CONFIG_ZPOOL + INIT_LIST_HEAD(&pool->lru); +#endif + return pool; err: @@ -2311,6 +2456,9 @@ void zs_destroy_pool(struct zs_pool *pool) int fg; struct size_class *class = pool->size_class[i]; + if (!class) + continue; + if (class->index != i) continue; @@ -2329,6 +2477,100 @@ void zs_destroy_pool(struct zs_pool *pool) } EXPORT_SYMBOL_GPL(zs_destroy_pool); +#ifdef CONFIG_ZPOOL +static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) +{ + int i, obj_idx, ret = 0; + unsigned long handle; + struct zspage *zspage; + struct page *page; + enum fullness_group fullness; + + /* Lock LRU and fullness list */ + spin_lock(&pool->lock); + if (list_empty(&pool->lru)) { + spin_unlock(&pool->lock); + return -EINVAL; + } + + for (i = 0; i < retries; i++) { + struct size_class *class; + + zspage = list_last_entry(&pool->lru, struct zspage, lru); + list_del(&zspage->lru); + + /* zs_free may free objects, but not the zspage and handles */ + zspage->under_reclaim = 
true; + + class = zspage_class(pool, zspage); + fullness = get_fullness_group(class, zspage); + + /* Lock out object allocations and object compaction */ + remove_zspage(class, zspage, fullness); + + spin_unlock(&pool->lock); + cond_resched(); + + /* Lock backing pages into place */ + lock_zspage(zspage); + + obj_idx = 0; + page = get_first_page(zspage); + while (1) { + handle = find_alloced_obj(class, page, &obj_idx); + if (!handle) { + page = get_next_page(page); + if (!page) + break; + obj_idx = 0; + continue; + } + + /* + * This will write the object and call zs_free. + * + * zs_free will free the object, but the + * under_reclaim flag prevents it from freeing + * the zspage altogether. This is necessary so + * that we can continue working with the + * zspage potentially after the last object + * has been freed. + */ + ret = pool->zpool_ops->evict(pool->zpool, handle); + if (ret) + goto next; + + obj_idx++; + } + +next: + /* For freeing the zspage, or putting it back in the pool and LRU list. */ + spin_lock(&pool->lock); + zspage->under_reclaim = false; + + if (!get_zspage_inuse(zspage)) { + /* + * Fullness went stale as zs_free() won't touch it + * while the page is removed from the pool. Fix it + * up for the check in __free_zspage(). 
+ */ + zspage->fullness = ZS_EMPTY; + + __free_zspage(pool, class, zspage); + spin_unlock(&pool->lock); + return 0; + } + + putback_zspage(class, zspage); + list_add(&zspage->lru, &pool->lru); + unlock_zspage(zspage); + } + + spin_unlock(&pool->lock); + return -EAGAIN; +} +#endif /* CONFIG_ZPOOL */ + static int __init zs_init(void) { int ret; diff --git a/mm/zswap.c b/mm/zswap.c index 2d48fd59cc7a..f6c89049cf70 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -958,7 +958,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) }; if (!zpool_can_sleep_mapped(pool)) { - tmp = kmalloc(PAGE_SIZE, GFP_ATOMIC); + tmp = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!tmp) return -ENOMEM; } @@ -968,6 +968,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) swpentry = zhdr->swpentry; /* here */ tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); + zpool_unmap_handle(pool, handle); /* find and ref zswap entry */ spin_lock(&tree->lock); @@ -975,20 +976,12 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); - zpool_unmap_handle(pool, handle); kfree(tmp); return 0; } spin_unlock(&tree->lock); BUG_ON(offset != entry->offset); - src = (u8 *)zhdr + sizeof(struct zswap_header); - if (!zpool_can_sleep_mapped(pool)) { - memcpy(tmp, src, entry->length); - src = tmp; - zpool_unmap_handle(pool, handle); - } - /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ @@ -1006,6 +999,14 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); dlen = PAGE_SIZE; + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); + src = (u8 *)zhdr + sizeof(struct zswap_header); + if (!zpool_can_sleep_mapped(pool)) { + memcpy(tmp, src, entry->length); + src = tmp; + zpool_unmap_handle(pool, 
handle); + } + mutex_lock(acomp_ctx->mutex); sg_init_one(&input, src, entry->length); sg_init_table(&output, 1); @@ -1015,6 +1016,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) dlen = acomp_ctx->req->dlen; mutex_unlock(acomp_ctx->mutex); + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); + else + zpool_unmap_handle(pool, handle); + BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -1045,7 +1051,11 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - goto end; + return ret; + +fail: + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); /* * if we get here due to ZSWAP_SWAPCACHE_EXIST @@ -1054,17 +1064,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) * if we free the entry in the following put * it is also okay to return !0 */ -fail: spin_lock(&tree->lock); zswap_entry_put(tree, entry); spin_unlock(&tree->lock); -end: - if (zpool_can_sleep_mapped(pool)) - zpool_unmap_handle(pool, handle); - else - kfree(tmp); - return ret; } @@ -1311,7 +1314,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, } if (!zpool_can_sleep_mapped(entry->pool->zpool)) { - tmp = kmalloc(entry->length, GFP_ATOMIC); + tmp = kmalloc(entry->length, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto freeentry; |