summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-05-27 11:40:49 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-05-27 11:40:49 -0700
commit8291eaafed36f575f23951f3ce18407f480e9ecf (patch)
tree279b61422ba2df7b8579af8ccc81331de80affa8
parent77fb622de1393b1d54f24f4f7ed98f84feeda502 (diff)
parentfa020a2b87d24016723fff4a4237deb612478a32 (diff)
downloadlinux-8291eaafed36f575f23951f3ce18407f480e9ecf.tar.gz
linux-8291eaafed36f575f23951f3ce18407f480e9ecf.tar.bz2
linux-8291eaafed36f575f23951f3ce18407f480e9ecf.zip
Merge tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull more MM updates from Andrew Morton: - Two follow-on fixes for the post-5.19 series "Use pageblock_order for cma and alloc_contig_range alignment", from Zi Yan. - A series of z3fold cleanups and fixes from Miaohe Lin. - Some memcg selftests work from Michal Koutný <mkoutny@suse.com> - Some swap fixes and cleanups from Miaohe Lin - Several individual minor fixups * tag 'mm-stable-2022-05-27' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (25 commits) mm/shmem.c: suppress shift warning mm: Kconfig: reorganize misplaced mm options mm: kasan: fix input of vmalloc_to_page() mm: fix is_pinnable_page against a cma page mm: filter out swapin error entry in shmem mapping mm/shmem: fix infinite loop when swap in shmem error at swapoff time mm/madvise: free hwpoison and swapin error entry in madvise_free_pte_range mm/swapfile: fix lost swap bits in unuse_pte() mm/swapfile: unuse_pte can map random data if swap read fails selftests: memcg: factor out common parts of memory.{low,min} tests selftests: memcg: remove protection from top level memcg selftests: memcg: adjust expected reclaim values of protected cgroups selftests: memcg: expect no low events in unprotected sibling selftests: memcg: fix compilation mm/z3fold: fix z3fold_page_migrate races with z3fold_map mm/z3fold: fix z3fold_reclaim_page races with z3fold_free mm/z3fold: always clear PAGE_CLAIMED under z3fold page lock mm/z3fold: put z3fold page back into unbuddied list when reclaim or migration fails revert "mm/z3fold.c: allow __GFP_HIGHMEM in z3fold_alloc" mm/z3fold: throw warning on failure of trylock_page in z3fold_alloc ...
-rw-r--r--MAINTAINERS1
-rw-r--r--include/linux/mm.h9
-rw-r--r--include/linux/swap.h7
-rw-r--r--include/linux/swapops.h10
-rw-r--r--init/Kconfig54
-rw-r--r--lib/Kconfig.debug35
-rw-r--r--mm/Kconfig56
-rw-r--r--mm/Kconfig.debug33
-rw-r--r--mm/internal.h4
-rw-r--r--mm/kasan/report.c2
-rw-r--r--mm/madvise.c18
-rw-r--r--mm/memory.c5
-rw-r--r--mm/page_alloc.c32
-rw-r--r--mm/page_isolation.c36
-rw-r--r--mm/shmem.c41
-rw-r--r--mm/swap_state.c3
-rw-r--r--mm/swapfile.c21
-rw-r--r--mm/z3fold.c97
-rw-r--r--tools/testing/selftests/cgroup/memcg_protection.m89
-rw-r--r--tools/testing/selftests/cgroup/test_memcontrol.c247
20 files changed, 436 insertions, 364 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 58548f79e50a..392467e9ab73 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5062,6 +5062,7 @@ L: linux-mm@kvack.org
S: Maintained
F: mm/memcontrol.c
F: mm/swap_cgroup.c
+F: tools/testing/selftests/cgroup/memcg_protection.m
F: tools/testing/selftests/cgroup/test_kmem.c
F: tools/testing/selftests/cgroup/test_memcontrol.c
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f85e5a4c070b..bc8f326be0ce 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1594,8 +1594,13 @@ static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
#ifdef CONFIG_MIGRATION
static inline bool is_pinnable_page(struct page *page)
{
- return !(is_zone_movable_page(page) || is_migrate_cma_page(page)) ||
- is_zero_pfn(page_to_pfn(page));
+#ifdef CONFIG_CMA
+ int mt = get_pageblock_migratetype(page);
+
+ if (mt == MIGRATE_CMA || mt == MIGRATE_ISOLATE)
+ return false;
+#endif
+ return !(is_zone_movable_page(page) || is_zero_pfn(page_to_pfn(page)));
}
#else
static inline bool is_pinnable_page(struct page *page)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index f3ae17b43f20..0c0fed1b348f 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -55,6 +55,10 @@ static inline int current_is_kswapd(void)
* actions on faults.
*/
+#define SWP_SWAPIN_ERROR_NUM 1
+#define SWP_SWAPIN_ERROR (MAX_SWAPFILES + SWP_HWPOISON_NUM + \
+ SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \
+ SWP_PTE_MARKER_NUM)
/*
* PTE markers are used to persist information onto PTEs that are mapped with
* file-backed memories. As its name "PTE" hints, it should only be applied to
@@ -120,7 +124,8 @@ static inline int current_is_kswapd(void)
#define MAX_SWAPFILES \
((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
- SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_PTE_MARKER_NUM)
+ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \
+ SWP_PTE_MARKER_NUM - SWP_SWAPIN_ERROR_NUM)
/*
* Magic header for a swap area. The first part of the union is
diff --git a/include/linux/swapops.h b/include/linux/swapops.h
index fe220df499f1..f24775b41880 100644
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -108,6 +108,16 @@ static inline void *swp_to_radix_entry(swp_entry_t entry)
return xa_mk_value(entry.val);
}
+static inline swp_entry_t make_swapin_error_entry(struct page *page)
+{
+ return swp_entry(SWP_SWAPIN_ERROR, page_to_pfn(page));
+}
+
+static inline int is_swapin_error_entry(swp_entry_t entry)
+{
+ return swp_type(entry) == SWP_SWAPIN_ERROR;
+}
+
#if IS_ENABLED(CONFIG_DEVICE_PRIVATE)
static inline swp_entry_t make_readable_device_private_entry(pgoff_t offset)
{
diff --git a/init/Kconfig b/init/Kconfig
index 7efe7c56893f..583675727b85 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1842,60 +1842,6 @@ config DEBUG_PERF_USE_VMALLOC
endmenu
-config VM_EVENT_COUNTERS
- default y
- bool "Enable VM event counters for /proc/vmstat" if EXPERT
- help
- VM event counters are needed for event counts to be shown.
- This option allows the disabling of the VM event counters
- on EXPERT systems. /proc/vmstat will only show page counts
- if VM event counters are disabled.
-
-config SLUB_DEBUG
- default y
- bool "Enable SLUB debugging support" if EXPERT
- depends on SLUB && SYSFS
- select STACKDEPOT if STACKTRACE_SUPPORT
- help
- SLUB has extensive debug support features. Disabling these can
- result in significant savings in code size. This also disables
- SLUB sysfs support. /sys/slab will not exist and there will be
- no support for cache validation etc.
-
-config COMPAT_BRK
- bool "Disable heap randomization"
- default y
- help
- Randomizing heap placement makes heap exploits harder, but it
- also breaks ancient binaries (including anything libc5 based).
- This option changes the bootup default to heap randomization
- disabled, and can be overridden at runtime by setting
- /proc/sys/kernel/randomize_va_space to 2.
-
- On non-ancient distros (post-2000 ones) N is usually a safe choice.
-
-config MMAP_ALLOW_UNINITIALIZED
- bool "Allow mmapped anonymous memory to be uninitialized"
- depends on EXPERT && !MMU
- default n
- help
- Normally, and according to the Linux spec, anonymous memory obtained
- from mmap() has its contents cleared before it is passed to
- userspace. Enabling this config option allows you to request that
- mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
- providing a huge performance boost. If this option is not enabled,
- then the flag will be ignored.
-
- This is taken advantage of by uClibc's malloc(), and also by
- ELF-FDPIC binfmt's brk and stack allocator.
-
- Because of the obvious security issues, this option should only be
- enabled on embedded devices where you control what is run in
- userspace. Since that isn't generally a problem on no-MMU systems,
- it is normally safe to say Y here.
-
- See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
-
config SYSTEM_DATA_VERIFICATION
def_bool n
select SYSTEM_TRUSTED_KEYRING
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b1e506f1d328..4cea85a83321 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -699,41 +699,6 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT
help
Debug objects boot parameter default value
-config DEBUG_SLAB
- bool "Debug slab memory allocations"
- depends on DEBUG_KERNEL && SLAB
- help
- Say Y here to have the kernel do limited verification on memory
- allocation as well as poisoning memory on free to catch use of freed
- memory. This can make kmalloc/kfree-intensive workloads much slower.
-
-config SLUB_DEBUG_ON
- bool "SLUB debugging on by default"
- depends on SLUB && SLUB_DEBUG
- select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
- default n
- help
- Boot with debugging on by default. SLUB boots by default with
- the runtime debug capabilities switched off. Enabling this is
- equivalent to specifying the "slub_debug" parameter on boot.
- There is no support for more fine grained debug control like
- possible with slub_debug=xxx. SLUB debugging may be switched
- off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
- "slub_debug=-".
-
-config SLUB_STATS
- default n
- bool "Enable SLUB performance statistics"
- depends on SLUB && SYSFS
- help
- SLUB statistics are useful to debug SLUBs allocation behavior in
- order find ways to optimize the allocator. This should never be
- enabled for production use since keeping statistics slows down
- the allocator by a few percentage points. The slabinfo command
- supports the determination of the most active slabs to figure
- out which slabs are relevant to a particular load.
- Try running: slabinfo -DA
-
config HAVE_DEBUG_KMEMLEAK
bool
diff --git a/mm/Kconfig b/mm/Kconfig
index 905c205e14f3..169e64192e48 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -270,6 +270,19 @@ config SLAB_FREELIST_HARDENED
sanity-checking than others. This option is most effective with
CONFIG_SLUB.
+config SLUB_STATS
+ default n
+ bool "Enable SLUB performance statistics"
+ depends on SLUB && SYSFS
+ help
+ SLUB statistics are useful to debug SLUBs allocation behavior in
+ order find ways to optimize the allocator. This should never be
+ enabled for production use since keeping statistics slows down
+ the allocator by a few percentage points. The slabinfo command
+ supports the determination of the most active slabs to figure
+ out which slabs are relevant to a particular load.
+ Try running: slabinfo -DA
+
config SLUB_CPU_PARTIAL
default y
depends on SLUB && SMP
@@ -307,6 +320,40 @@ config SHUFFLE_PAGE_ALLOCATOR
Say Y if unsure.
+config COMPAT_BRK
+ bool "Disable heap randomization"
+ default y
+ help
+ Randomizing heap placement makes heap exploits harder, but it
+ also breaks ancient binaries (including anything libc5 based).
+ This option changes the bootup default to heap randomization
+ disabled, and can be overridden at runtime by setting
+ /proc/sys/kernel/randomize_va_space to 2.
+
+ On non-ancient distros (post-2000 ones) N is usually a safe choice.
+
+config MMAP_ALLOW_UNINITIALIZED
+ bool "Allow mmapped anonymous memory to be uninitialized"
+ depends on EXPERT && !MMU
+ default n
+ help
+ Normally, and according to the Linux spec, anonymous memory obtained
+ from mmap() has its contents cleared before it is passed to
+ userspace. Enabling this config option allows you to request that
+ mmap() skip that if it is given an MAP_UNINITIALIZED flag, thus
+ providing a huge performance boost. If this option is not enabled,
+ then the flag will be ignored.
+
+ This is taken advantage of by uClibc's malloc(), and also by
+ ELF-FDPIC binfmt's brk and stack allocator.
+
+ Because of the obvious security issues, this option should only be
+ enabled on embedded devices where you control what is run in
+ userspace. Since that isn't generally a problem on no-MMU systems,
+ it is normally safe to say Y here.
+
+ See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
+
config SELECT_MEMORY_MODEL
def_bool y
depends on ARCH_SELECT_MEMORY_MODEL
@@ -964,6 +1011,15 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
bool
+config VM_EVENT_COUNTERS
+ default y
+ bool "Enable VM event counters for /proc/vmstat" if EXPERT
+ help
+ VM event counters are needed for event counts to be shown.
+ This option allows the disabling of the VM event counters
+ on EXPERT systems. /proc/vmstat will only show page counts
+ if VM event counters are disabled.
+
config PERCPU_STATS
bool "Collect percpu memory statistics"
help
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 5bd5bb097252..ce8dded36de9 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -45,6 +45,39 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
+config DEBUG_SLAB
+ bool "Debug slab memory allocations"
+ depends on DEBUG_KERNEL && SLAB
+ help
+ Say Y here to have the kernel do limited verification on memory
+ allocation as well as poisoning memory on free to catch use of freed
+ memory. This can make kmalloc/kfree-intensive workloads much slower.
+
+config SLUB_DEBUG
+ default y
+ bool "Enable SLUB debugging support" if EXPERT
+ depends on SLUB && SYSFS
+ select STACKDEPOT if STACKTRACE_SUPPORT
+ help
+ SLUB has extensive debug support features. Disabling these can
+ result in significant savings in code size. This also disables
+ SLUB sysfs support. /sys/slab will not exist and there will be
+ no support for cache validation etc.
+
+config SLUB_DEBUG_ON
+ bool "SLUB debugging on by default"
+ depends on SLUB && SLUB_DEBUG
+ select STACKDEPOT_ALWAYS_INIT if STACKTRACE_SUPPORT
+ default n
+ help
+ Boot with debugging on by default. SLUB boots by default with
+ the runtime debug capabilities switched off. Enabling this is
+ equivalent to specifying the "slub_debug" parameter on boot.
+ There is no support for more fine grained debug control like
+ possible with slub_debug=xxx. SLUB debugging may be switched
+ off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
+ "slub_debug=-".
+
config PAGE_OWNER
bool "Track page owner"
depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
diff --git a/mm/internal.h b/mm/internal.h
index 64e61b032dac..c0f8fbe0445b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -374,8 +374,8 @@ extern void *memmap_alloc(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr,
int nid, bool exact_nid);
-void split_free_page(struct page *free_page,
- int order, unsigned long split_pfn_offset);
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 199d77cce21a..b341a191651d 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -347,7 +347,7 @@ static void print_address_description(void *addr, u8 tag)
va->addr, va->addr + va->size, va->caller);
pr_err("\n");
- page = vmalloc_to_page(page);
+ page = vmalloc_to_page(addr);
}
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 4d6592488b51..d7b4f2602949 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -248,10 +248,13 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma,
if (!xa_is_value(page))
continue;
+ swap = radix_to_swp_entry(page);
+ /* There might be swapin error entries in shmem mapping. */
+ if (non_swap_entry(swap))
+ continue;
xas_pause(&xas);
rcu_read_unlock();
- swap = radix_to_swp_entry(page);
page = read_swap_cache_async(swap, GFP_HIGHUSER_MOVABLE,
NULL, 0, false, &splug);
if (page)
@@ -624,11 +627,14 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
swp_entry_t entry;
entry = pte_to_swp_entry(ptent);
- if (non_swap_entry(entry))
- continue;
- nr_swap--;
- free_swap_and_cache(entry);
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ if (!non_swap_entry(entry)) {
+ nr_swap--;
+ free_swap_and_cache(entry);
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ } else if (is_hwpoison_entry(entry) ||
+ is_swapin_error_entry(entry)) {
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ }
continue;
}
diff --git a/mm/memory.c b/mm/memory.c
index 54bcd5327b74..21dadf03f089 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1487,7 +1487,8 @@ again:
/* Only drop the uffd-wp marker if explicitly requested */
if (!zap_drop_file_uffd_wp(details))
continue;
- } else if (is_hwpoison_entry(entry)) {
+ } else if (is_hwpoison_entry(entry) ||
+ is_swapin_error_entry(entry)) {
if (!should_zap_cows(details))
continue;
} else {
@@ -3727,6 +3728,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
+ } else if (is_swapin_error_entry(entry)) {
+ ret = VM_FAULT_SIGBUS;
} else if (is_pte_marker_entry(entry)) {
ret = handle_pte_marker(vmf);
} else {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 149f2ab5063b..e008a3df0485 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -482,8 +482,12 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page,
bitidx = pfn_to_bitidx(page, pfn);
word_bitidx = bitidx / BITS_PER_LONG;
bitidx &= (BITS_PER_LONG-1);
-
- word = bitmap[word_bitidx];
+ /*
+ * This races, without locks, with set_pfnblock_flags_mask(). Ensure
+ * a consistent read of the memory array, so that results, even though
+ * racy, are not corrupted.
+ */
+ word = READ_ONCE(bitmap[word_bitidx]);
return (word >> bitidx) & mask;
}
@@ -1100,30 +1104,44 @@ done_merging:
* @order: the order of the page
* @split_pfn_offset: split offset within the page
*
+ * Return -ENOENT if the free page is changed, otherwise 0
+ *
* It is used when the free page crosses two pageblocks with different migratetypes
* at split_pfn_offset within the page. The split free page will be put into
* separate migratetype lists afterwards. Otherwise, the function achieves
* nothing.
*/
-void split_free_page(struct page *free_page,
- int order, unsigned long split_pfn_offset)
+int split_free_page(struct page *free_page,
+ unsigned int order, unsigned long split_pfn_offset)
{
struct zone *zone = page_zone(free_page);
unsigned long free_page_pfn = page_to_pfn(free_page);
unsigned long pfn;
unsigned long flags;
int free_page_order;
+ int mt;
+ int ret = 0;
if (split_pfn_offset == 0)
- return;
+ return ret;
spin_lock_irqsave(&zone->lock, flags);
+
+ if (!PageBuddy(free_page) || buddy_order(free_page) != order) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ mt = get_pageblock_migratetype(free_page);
+ if (likely(!is_migrate_isolate(mt)))
+ __mod_zone_freepage_state(zone, -(1UL << order), mt);
+
del_page_from_free_list(free_page, zone, order);
for (pfn = free_page_pfn;
pfn < free_page_pfn + (1UL << order);) {
int mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn);
- free_page_order = min_t(int,
+ free_page_order = min_t(unsigned int,
pfn ? __ffs(pfn) : order,
__fls(split_pfn_offset));
__free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order,
@@ -1134,7 +1152,9 @@ void split_free_page(struct page *free_page,
if (split_pfn_offset == 0)
split_pfn_offset = (1UL << order) - (pfn - free_page_pfn);
}
+out:
spin_unlock_irqrestore(&zone->lock, flags);
+ return ret;
}
/*
* A bad page could be due to a number of fields. Instead of multiple branches,
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c643c8420809..6021f8444b5a 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -300,7 +300,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before)
+ gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
{
unsigned char saved_mt;
unsigned long start_pfn;
@@ -327,11 +327,16 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
zone->zone_start_pfn);
saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
- isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
- if (ret)
- return ret;
+ if (skip_isolation)
+ VM_BUG_ON(!is_migrate_isolate(saved_mt));
+ else {
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
+ isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+
+ if (ret)
+ return ret;
+ }
/*
* Bail out early when the to-be-isolated pageblock does not form
@@ -366,9 +371,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
if (PageBuddy(page)) {
int order = buddy_order(page);
- if (pfn + (1UL << order) > boundary_pfn)
- split_free_page(page, order, boundary_pfn - pfn);
- pfn += (1UL << order);
+ if (pfn + (1UL << order) > boundary_pfn) {
+ /* free page changed before split, check it again */
+ if (split_free_page(page, order, boundary_pfn - pfn))
+ continue;
+ }
+
+ pfn += 1UL << order;
continue;
}
/*
@@ -463,7 +472,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
return 0;
failed:
/* restore the original migratetype */
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+ if (!skip_isolation)
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
return -EBUSY;
}
@@ -522,14 +532,18 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages);
unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages);
int ret;
+ bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false);
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
if (ret)
return ret;
+ if (isolate_start == isolate_end - pageblock_nr_pages)
+ skip_isolation = true;
+
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true);
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index da30c769b376..a6f565308133 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1174,6 +1174,10 @@ static int shmem_find_swap_entries(struct address_space *mapping,
continue;
entry = radix_to_swp_entry(folio);
+ /*
+ * swapin error entries can be found in the mapping. But they're
+ * deliberately ignored here as we've done everything we can do.
+ */
if (swp_type(entry) != type)
continue;
@@ -1671,6 +1675,36 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
return error;
}
+static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
+ struct folio *folio, swp_entry_t swap)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swapin_error;
+ void *old;
+
+ swapin_error = make_swapin_error_entry(&folio->page);
+ old = xa_cmpxchg_irq(&mapping->i_pages, index,
+ swp_to_radix_entry(swap),
+ swp_to_radix_entry(swapin_error), 0);
+ if (old != swp_to_radix_entry(swap))
+ return;
+
+ folio_wait_writeback(folio);
+ delete_from_swap_cache(&folio->page);
+ spin_lock_irq(&info->lock);
+ /*
+ * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
+ * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
+ * shmem_evict_inode.
+ */
+ info->alloced--;
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ swap_free(swap);
+}
+
/*
* Swap in the page pointed to by *pagep.
* Caller has to make sure that *pagep contains a valid swapped page.
@@ -1694,6 +1728,9 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
swap = radix_to_swp_entry(*foliop);
*foliop = NULL;
+ if (is_swapin_error_entry(swap))
+ return -EIO;
+
/* Look it up and read it in.. */
page = lookup_swap_cache(swap, NULL, 0);
if (!page) {
@@ -1761,6 +1798,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
failed:
if (!shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
+ if (error == -EIO)
+ shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
if (folio) {
folio_unlock(folio);
@@ -1906,7 +1945,7 @@ alloc_nohuge:
spin_lock_irq(&info->lock);
info->alloced += folio_nr_pages(folio);
- inode->i_blocks += BLOCKS_PER_PAGE << folio_order(folio);
+ inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
alloced = true;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b9e4ed2e90bf..778d57d2d92d 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -410,6 +410,9 @@ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
return NULL;
swp = radix_to_swp_entry(page);
+ /* There might be swapin error entries in shmem mapping. */
+ if (non_swap_entry(swp))
+ return NULL;
/* Prevent swapoff from happening to us */
si = get_swap_device(swp);
if (!si)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 94b4ff43ead0..a2e66d855b19 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1775,7 +1775,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
{
struct page *swapcache;
spinlock_t *ptl;
- pte_t *pte;
+ pte_t *pte, new_pte;
int ret = 1;
swapcache = page;
@@ -1789,6 +1789,17 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
goto out;
}
+ if (unlikely(!PageUptodate(page))) {
+ pte_t pteval;
+
+ dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
+ pteval = swp_entry_to_pte(make_swapin_error_entry(page));
+ set_pte_at(vma->vm_mm, addr, pte, pteval);
+ swap_free(entry);
+ ret = 0;
+ goto out;
+ }
+
/* See do_swap_page() */
BUG_ON(!PageAnon(page) && PageMappedToDisk(page));
BUG_ON(PageAnon(page) && PageAnonExclusive(page));
@@ -1813,8 +1824,12 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
page_add_new_anon_rmap(page, vma, addr);
lru_cache_add_inactive_or_unevictable(page, vma);
}
- set_pte_at(vma->vm_mm, addr, pte,
- pte_mkold(mk_pte(page, vma->vm_page_prot)));
+ new_pte = pte_mkold(mk_pte(page, vma->vm_page_prot));
+ if (pte_swp_soft_dirty(*pte))
+ new_pte = pte_mksoft_dirty(new_pte);
+ if (pte_swp_uffd_wp(*pte))
+ new_pte = pte_mkuffd_wp(new_pte);
+ set_pte_at(vma->vm_mm, addr, pte, new_pte);
swap_free(entry);
out:
pte_unmap_unlock(pte, ptl);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 83b5a3514427..f41f8b0d9e9a 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -181,6 +181,7 @@ enum z3fold_page_flags {
NEEDS_COMPACTING,
PAGE_STALE,
PAGE_CLAIMED, /* by either reclaim or free */
+ PAGE_MIGRATED, /* page is migrated and soon to be released */
};
/*
@@ -212,10 +213,8 @@ static int size_to_chunks(size_t size)
static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
gfp_t gfp)
{
- struct z3fold_buddy_slots *slots;
-
- slots = kmem_cache_zalloc(pool->c_handle,
- (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
+ struct z3fold_buddy_slots *slots = kmem_cache_zalloc(pool->c_handle,
+ gfp);
if (slots) {
/* It will be freed separately in free_handle(). */
@@ -272,8 +271,13 @@ static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
locked = z3fold_page_trylock(zhdr);
read_unlock(&slots->lock);
- if (locked)
- break;
+ if (locked) {
+ struct page *page = virt_to_page(zhdr);
+
+ if (!test_bit(PAGE_MIGRATED, &page->private))
+ break;
+ z3fold_page_unlock(zhdr);
+ }
cpu_relax();
} while (true);
} else {
@@ -391,6 +395,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
clear_bit(PAGE_CLAIMED, &page->private);
+ clear_bit(PAGE_MIGRATED, &page->private);
if (headless)
return zhdr;
@@ -521,13 +526,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
atomic64_dec(&pool->pages_nr);
}
-static void release_z3fold_page(struct kref *ref)
-{
- struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
- refcount);
- __release_z3fold_page(zhdr, false);
-}
-
static void release_z3fold_page_locked(struct kref *ref)
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
@@ -940,10 +938,19 @@ lookup:
}
}
- if (zhdr && !zhdr->slots)
- zhdr->slots = alloc_slots(pool,
- can_sleep ? GFP_NOIO : GFP_ATOMIC);
+ if (zhdr && !zhdr->slots) {
+ zhdr->slots = alloc_slots(pool, GFP_ATOMIC);
+ if (!zhdr->slots)
+ goto out_fail;
+ }
return zhdr;
+
+out_fail:
+ if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ add_to_unbuddied(pool, zhdr);
+ z3fold_page_unlock(zhdr);
+ }
+ return NULL;
}
/*
@@ -1066,7 +1073,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
enum buddy bud;
bool can_sleep = gfpflags_allow_blocking(gfp);
- if (!size)
+ if (!size || (gfp & __GFP_HIGHMEM))
return -EINVAL;
if (size > PAGE_SIZE)
@@ -1093,28 +1100,7 @@ retry:
bud = FIRST;
}
- page = NULL;
- if (can_sleep) {
- spin_lock(&pool->stale_lock);
- zhdr = list_first_entry_or_null(&pool->stale,
- struct z3fold_header, buddy);
- /*
- * Before allocating a page, let's see if we can take one from
- * the stale pages list. cancel_work_sync() can sleep so we
- * limit this case to the contexts where we can sleep
- */
- if (zhdr) {
- list_del(&zhdr->buddy);
- spin_unlock(&pool->stale_lock);
- cancel_work_sync(&zhdr->work);
- page = virt_to_page(zhdr);
- } else {
- spin_unlock(&pool->stale_lock);
- }
- }
- if (!page)
- page = alloc_page(gfp);
-
+ page = alloc_page(gfp);
if (!page)
return -ENOMEM;
@@ -1134,10 +1120,9 @@ retry:
__SetPageMovable(page, pool->inode->i_mapping);
unlock_page(page);
} else {
- if (trylock_page(page)) {
- __SetPageMovable(page, pool->inode->i_mapping);
- unlock_page(page);
- }
+ WARN_ON(!trylock_page(page));
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
}
z3fold_page_lock(zhdr);
@@ -1236,8 +1221,8 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
return;
}
if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
- put_z3fold_header(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
+ put_z3fold_header(zhdr);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -1332,12 +1317,7 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
break;
}
- if (kref_get_unless_zero(&zhdr->refcount) == 0) {
- zhdr = NULL;
- break;
- }
if (!z3fold_page_trylock(zhdr)) {
- kref_put(&zhdr->refcount, release_z3fold_page);
zhdr = NULL;
continue; /* can't evict at this point */
}
@@ -1348,14 +1328,14 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
*/
if (zhdr->foreign_handles ||
test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- if (!kref_put(&zhdr->refcount,
- release_z3fold_page_locked))
- z3fold_page_unlock(zhdr);
+ z3fold_page_unlock(zhdr);
zhdr = NULL;
continue; /* can't evict such page */
}
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
+ /* See comment in __z3fold_alloc. */
+ kref_get(&zhdr->refcount);
break;
}
@@ -1437,8 +1417,10 @@ next:
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
- z3fold_page_unlock(zhdr);
+ if (list_empty(&zhdr->buddy))
+ add_to_unbuddied(pool, zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
}
/* We started off locked to we need to lock the pool back */
@@ -1590,8 +1572,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
if (!z3fold_page_trylock(zhdr))
return -EAGAIN;
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
- z3fold_page_unlock(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
+ z3fold_page_unlock(zhdr);
return -EBUSY;
}
if (work_pending(&zhdr->work)) {
@@ -1601,7 +1583,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
new_zhdr = page_address(newpage);
memcpy(new_zhdr, zhdr, PAGE_SIZE);
newpage->private = page->private;
- page->private = 0;
+ set_bit(PAGE_MIGRATED, &page->private);
z3fold_page_unlock(zhdr);
spin_lock_init(&new_zhdr->page_lock);
INIT_WORK(&new_zhdr->work, compact_page_work);
@@ -1631,7 +1613,8 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
- clear_bit(PAGE_CLAIMED, &page->private);
+ /* PAGE_CLAIMED and PAGE_MIGRATED are cleared now. */
+ page->private = 0;
put_page(page);
return 0;
}
@@ -1653,6 +1636,8 @@ static void z3fold_page_putback(struct page *page)
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
+ if (list_empty(&zhdr->buddy))
+ add_to_unbuddied(pool, zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
diff --git a/tools/testing/selftests/cgroup/memcg_protection.m b/tools/testing/selftests/cgroup/memcg_protection.m
new file mode 100644
index 000000000000..051daa3477b6
--- /dev/null
+++ b/tools/testing/selftests/cgroup/memcg_protection.m
@@ -0,0 +1,89 @@
+% SPDX-License-Identifier: GPL-2.0
+%
+% run as: octave-cli memcg_protection.m
+%
+% This script simulates reclaim protection behavior on a single level of memcg
+% hierarchy to illustrate how overcommitted protection spreads among siblings
+% (as it depends also on their current consumption).
+%
+% Simulation assumes siblings consumed the initial amount of memory (w/out
+% reclaim) and then the reclaim starts, all memory is reclaimable, i.e. treated
+% same. It simulates only non-low reclaim and assumes all memory.min = 0.
+%
+% Input configurations
+% --------------------
+% E number parent effective protection
+% n vector nominal protection of siblings set at the given level (memory.low)
+% c vector current consumption -,,- (memory.current)
+
+% example from testcase (values in GB)
+E = 50 / 1024;
+n = [75 25 0 500 ] / 1024;
+c = [50 50 50 0] / 1024;
+
+% Reclaim parameters
+% ------------------
+
+% Minimal reclaim amount (GB)
+cluster = 32*4 / 2**20;
+
+% Reclaim coefficient (think as 0.5^sc->priority)
+alpha = .1
+
+% Simulation parameters
+% ---------------------
+epsilon = 1e-7;
+timeout = 1000;
+
+% Simulation loop
+% ---------------
+
+ch = [];
+eh = [];
+rh = [];
+
+for t = 1:timeout
+ % low_usage
+ u = min(c, n);
+ siblings = sum(u);
+
+ % effective_protection()
+ protected = min(n, c); % start with nominal
+ e = protected * min(1, E / siblings); % normalize overcommit
+
+ % recursive protection
+ unclaimed = max(0, E - siblings);
+ parent_overuse = sum(c) - siblings;
+ if (unclaimed > 0 && parent_overuse > 0)
+ overuse = max(0, c - protected);
+ e += unclaimed * (overuse / parent_overuse);
+ endif
+
+ % get_scan_count()
+ r = alpha * c; % assume all memory is in a single LRU list
+
+ % commit 1bc63fb1272b ("mm, memcg: make scan aggression always exclude protection")
+ sz = max(e, c);
+ r .*= (1 - (e+epsilon) ./ (sz+epsilon));
+
+ % uncomment to debug prints
+ % e, c, r
+
+ % nothing to reclaim, reached equilibrium
+ if max(r) < epsilon
+ break;
+ endif
+
+ % SWAP_CLUSTER_MAX roundup
+ r = max(r, (r > epsilon) .* cluster);
+ % XXX here I do parallel reclaim of all siblings
+ % in reality reclaim is serialized and each sibling recalculates own residual
+ c = max(c - r, 0);
+
+ ch = [ch ; c];
+ eh = [eh ; e];
+ rh = [rh ; r];
+endfor
+
+t
+c, e
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 44a974ec472c..8833359556f3 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -190,13 +190,6 @@ cleanup:
return ret;
}
-static int alloc_pagecache_50M(const char *cgroup, void *arg)
-{
- int fd = (long)arg;
-
- return alloc_pagecache(fd, MB(50));
-}
-
static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
{
int fd = (long)arg;
@@ -247,33 +240,39 @@ static int cg_test_proc_killed(const char *cgroup)
/*
* First, this test creates the following hierarchy:
- * A memory.min = 50M, memory.max = 200M
- * A/B memory.min = 50M, memory.current = 50M
+ * A memory.min = 0, memory.max = 200M
+ * A/B memory.min = 50M
* A/B/C memory.min = 75M, memory.current = 50M
* A/B/D memory.min = 25M, memory.current = 50M
* A/B/E memory.min = 0, memory.current = 50M
* A/B/F memory.min = 500M, memory.current = 0
*
- * Usages are pagecache, but the test keeps a running
+ * (or memory.low if we test soft protection)
+ *
+ * Usages are pagecache and the test keeps a running
* process in every leaf cgroup.
* Then it creates A/G and creates a significant
- * memory pressure in it.
+ * memory pressure in A.
*
+ * Then it checks actual memory usages and expects that:
* A/B memory.current ~= 50M
- * A/B/C memory.current ~= 33M
- * A/B/D memory.current ~= 17M
- * A/B/F memory.current ~= 0
+ * A/B/C memory.current ~= 29M
+ * A/B/D memory.current ~= 21M
+ * A/B/E memory.current ~= 0
+ * A/B/F memory.current = 0
+ * (for origin of the numbers, see model in memcg_protection.m.)
*
* After that it tries to allocate more than there is
- * unprotected memory in A available, and checks
- * checks that memory.min protects pagecache even
- * in this case.
+ * unprotected memory in A available, and checks that:
+ * a) memory.min protects pagecache even in this case,
+ * b) memory.low allows reclaiming page cache with low events.
*/
-static int test_memcg_min(const char *root)
+static int test_memcg_protection(const char *root, bool min)
{
- int ret = KSFT_FAIL;
+ int ret = KSFT_FAIL, rc;
char *parent[3] = {NULL};
char *children[4] = {NULL};
+ const char *attribute = min ? "memory.min" : "memory.low";
long c[4];
int i, attempts;
int fd;
@@ -297,8 +296,10 @@ static int test_memcg_min(const char *root)
if (cg_create(parent[0]))
goto cleanup;
- if (cg_read_long(parent[0], "memory.min")) {
- ret = KSFT_SKIP;
+ if (cg_read_long(parent[0], attribute)) {
+ /* No memory.min on older kernels is fine */
+ if (min)
+ ret = KSFT_SKIP;
goto cleanup;
}
@@ -335,17 +336,15 @@ static int test_memcg_min(const char *root)
(void *)(long)fd);
}
- if (cg_write(parent[0], "memory.min", "50M"))
+ if (cg_write(parent[1], attribute, "50M"))
goto cleanup;
- if (cg_write(parent[1], "memory.min", "50M"))
+ if (cg_write(children[0], attribute, "75M"))
goto cleanup;
- if (cg_write(children[0], "memory.min", "75M"))
+ if (cg_write(children[1], attribute, "25M"))
goto cleanup;
- if (cg_write(children[1], "memory.min", "25M"))
+ if (cg_write(children[2], attribute, "0"))
goto cleanup;
- if (cg_write(children[2], "memory.min", "0"))
- goto cleanup;
- if (cg_write(children[3], "memory.min", "500M"))
+ if (cg_write(children[3], attribute, "500M"))
goto cleanup;
attempts = 0;
@@ -365,170 +364,35 @@ static int test_memcg_min(const char *root)
for (i = 0; i < ARRAY_SIZE(children); i++)
c[i] = cg_read_long(children[i], "memory.current");
- if (!values_close(c[0], MB(33), 10))
+ if (!values_close(c[0], MB(29), 10))
goto cleanup;
- if (!values_close(c[1], MB(17), 10))
+ if (!values_close(c[1], MB(21), 10))
goto cleanup;
if (c[3] != 0)
goto cleanup;
- if (!cg_run(parent[2], alloc_anon, (void *)MB(170)))
- goto cleanup;
-
- if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
- goto cleanup;
-
- ret = KSFT_PASS;
-
-cleanup:
- for (i = ARRAY_SIZE(children) - 1; i >= 0; i--) {
- if (!children[i])
- continue;
-
- cg_destroy(children[i]);
- free(children[i]);
- }
-
- for (i = ARRAY_SIZE(parent) - 1; i >= 0; i--) {
- if (!parent[i])
- continue;
-
- cg_destroy(parent[i]);
- free(parent[i]);
- }
- close(fd);
- return ret;
-}
-
-/*
- * First, this test creates the following hierarchy:
- * A memory.low = 50M, memory.max = 200M
- * A/B memory.low = 50M, memory.current = 50M
- * A/B/C memory.low = 75M, memory.current = 50M
- * A/B/D memory.low = 25M, memory.current = 50M
- * A/B/E memory.low = 0, memory.current = 50M
- * A/B/F memory.low = 500M, memory.current = 0
- *
- * Usages are pagecache.
- * Then it creates A/G an creates a significant
- * memory pressure in it.
- *
- * Then it checks actual memory usages and expects that:
- * A/B memory.current ~= 50M
- * A/B/ memory.current ~= 33M
- * A/B/D memory.current ~= 17M
- * A/B/F memory.current ~= 0
- *
- * After that it tries to allocate more than there is
- * unprotected memory in A available,
- * and checks low and oom events in memory.events.
- */
-static int test_memcg_low(const char *root)
-{
- int ret = KSFT_FAIL;
- char *parent[3] = {NULL};
- char *children[4] = {NULL};
- long low, oom;
- long c[4];
- int i;
- int fd;
-
- fd = get_temp_fd();
- if (fd < 0)
- goto cleanup;
-
- parent[0] = cg_name(root, "memcg_test_0");
- if (!parent[0])
- goto cleanup;
-
- parent[1] = cg_name(parent[0], "memcg_test_1");
- if (!parent[1])
- goto cleanup;
-
- parent[2] = cg_name(parent[0], "memcg_test_2");
- if (!parent[2])
- goto cleanup;
-
- if (cg_create(parent[0]))
- goto cleanup;
-
- if (cg_read_long(parent[0], "memory.low"))
- goto cleanup;
-
- if (cg_write(parent[0], "cgroup.subtree_control", "+memory"))
- goto cleanup;
-
- if (cg_write(parent[0], "memory.max", "200M"))
- goto cleanup;
-
- if (cg_write(parent[0], "memory.swap.max", "0"))
- goto cleanup;
-
- if (cg_create(parent[1]))
- goto cleanup;
-
- if (cg_write(parent[1], "cgroup.subtree_control", "+memory"))
+ rc = cg_run(parent[2], alloc_anon, (void *)MB(170));
+ if (min && !rc)
goto cleanup;
-
- if (cg_create(parent[2]))
+ else if (!min && rc) {
+ fprintf(stderr,
+ "memory.low prevents from allocating anon memory\n");
goto cleanup;
-
- for (i = 0; i < ARRAY_SIZE(children); i++) {
- children[i] = cg_name_indexed(parent[1], "child_memcg", i);
- if (!children[i])
- goto cleanup;
-
- if (cg_create(children[i]))
- goto cleanup;
-
- if (i > 2)
- continue;
-
- if (cg_run(children[i], alloc_pagecache_50M, (void *)(long)fd))
- goto cleanup;
}
- if (cg_write(parent[0], "memory.low", "50M"))
- goto cleanup;
- if (cg_write(parent[1], "memory.low", "50M"))
- goto cleanup;
- if (cg_write(children[0], "memory.low", "75M"))
- goto cleanup;
- if (cg_write(children[1], "memory.low", "25M"))
- goto cleanup;
- if (cg_write(children[2], "memory.low", "0"))
- goto cleanup;
- if (cg_write(children[3], "memory.low", "500M"))
- goto cleanup;
-
- if (cg_run(parent[2], alloc_anon, (void *)MB(148)))
- goto cleanup;
-
if (!values_close(cg_read_long(parent[1], "memory.current"), MB(50), 3))
goto cleanup;
- for (i = 0; i < ARRAY_SIZE(children); i++)
- c[i] = cg_read_long(children[i], "memory.current");
-
- if (!values_close(c[0], MB(33), 10))
- goto cleanup;
-
- if (!values_close(c[1], MB(17), 10))
- goto cleanup;
-
- if (c[3] != 0)
- goto cleanup;
-
- if (cg_run(parent[2], alloc_anon, (void *)MB(166))) {
- fprintf(stderr,
- "memory.low prevents from allocating anon memory\n");
+ if (min) {
+ ret = KSFT_PASS;
goto cleanup;
}
for (i = 0; i < ARRAY_SIZE(children); i++) {
- int no_low_events_index = has_recursiveprot ? 2 : 1;
+ int no_low_events_index = 1;
+ long low, oom;
oom = cg_read_key_long(children[i], "memory.events", "oom ");
low = cg_read_key_long(children[i], "memory.events", "low ");
@@ -564,6 +428,16 @@ cleanup:
return ret;
}
+static int test_memcg_min(const char *root)
+{
+ return test_memcg_protection(root, true);
+}
+
+static int test_memcg_low(const char *root)
+{
+ return test_memcg_protection(root, false);
+}
+
static int alloc_pagecache_max_30M(const char *cgroup, void *arg)
{
size_t size = MB(50);
@@ -1241,7 +1115,16 @@ static int test_memcg_oom_group_leaf_events(const char *root)
if (cg_read_key_long(child, "memory.events", "oom_kill ") <= 0)
goto cleanup;
- if (cg_read_key_long(parent, "memory.events", "oom_kill ") <= 0)
+ parent_oom_events = cg_read_key_long(
+ parent, "memory.events", "oom_kill ");
+ /*
+ * If memory_localevents is not enabled (the default), the parent should
+ * count OOM events in its children groups. Otherwise, it should not
+ * have observed any events.
+ */
+ if (has_localevents && parent_oom_events != 0)
+ goto cleanup;
+ else if (!has_localevents && parent_oom_events <= 0)
goto cleanup;
ret = KSFT_PASS;
@@ -1349,20 +1232,14 @@ static int test_memcg_oom_group_score_events(const char *root)
if (!cg_run(memcg, alloc_anon, (void *)MB(100)))
goto cleanup;
- parent_oom_events = cg_read_key_long(
- parent, "memory.events", "oom_kill ");
- /*
- * If memory_localevents is not enabled (the default), the parent should
- * count OOM events in its children groups. Otherwise, it should not
- * have observed any events.
- */
- if ((has_localevents && parent_oom_events == 0) ||
- parent_oom_events > 0)
- ret = KSFT_PASS;
+ if (cg_read_key_long(memcg, "memory.events", "oom_kill ") != 3)
+ goto cleanup;
if (kill(safe_pid, SIGKILL))
goto cleanup;
+ ret = KSFT_PASS;
+
cleanup:
if (memcg)
cg_destroy(memcg);