From a0a44d9175b349df2462089140fb7f292100bd7c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 27 May 2024 11:01:28 +0200 Subject: mm, slab: don't wrap internal functions with alloc_hooks() The functions __kmalloc_noprof(), kmalloc_large_noprof(), kmalloc_trace_noprof() and their _node variants are all internal to the implementations of kmalloc_noprof() and kmalloc_node_noprof() and are only declared in the "public" slab.h and exported so that those implementations can be static inline and distinguish the build-time constant size variants. The only other users for some of the internal functions are slub_kunit and fortify_kunit tests which make very short-lived allocations. Therefore we can stop wrapping them with the alloc_hooks() macro. Instead add a __ prefix to all of them and a comment documenting these as internal. Also rename __kmalloc_trace() to __kmalloc_cache() which is more descriptive - it is a variant of __kmalloc() where the exact kmalloc cache has been already determined. The usage in fortify_kunit can be removed completely, as the internal functions should be tested already through kmalloc() tests in the test variant that passes non-constant allocation size. Reported-by: Kent Overstreet Cc: Suren Baghdasaryan Cc: Kees Cook Reviewed-by: Kent Overstreet Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slub.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 0809760cf789..95e0a3332c44 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4053,7 +4053,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. 
*/ -static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) +static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) { struct folio *folio; void *ptr = NULL; @@ -4078,25 +4078,25 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -void *kmalloc_large_noprof(size_t size, gfp_t flags) +void *__kmalloc_large_noprof(size_t size, gfp_t flags) { - void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE); trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_large_noprof); +EXPORT_SYMBOL(__kmalloc_large_noprof); -void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) +void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) { - void *ret = __kmalloc_large_node(size, flags, node); + void *ret = ___kmalloc_large_node(size, flags, node); trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_large_node_noprof); +EXPORT_SYMBOL(__kmalloc_large_node_noprof); static __always_inline void *__do_kmalloc_node(size_t size, gfp_t flags, int node, @@ -4106,7 +4106,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, void *ret; if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = __kmalloc_large_node(size, flags, node); + ret = __kmalloc_large_node_noprof(size, flags, node); trace_kmalloc(caller, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; @@ -4142,7 +4142,7 @@ void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, } EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); -void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, size); @@ -4152,10 +4152,10 @@ void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_trace_noprof); +EXPORT_SYMBOL(__kmalloc_cache_noprof); -void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) +void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); @@ -4164,7 +4164,7 @@ void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_node_trace_noprof); +EXPORT_SYMBOL(__kmalloc_cache_node_noprof); static noinline void free_to_partial_list( struct kmem_cache *s, struct slab *slab, -- cgit v1.2.3 From 4d2bcefa965b06a1f2be6912456bcfa86a34f184 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 31 May 2024 13:29:02 +0100 Subject: mm: Reduce the number of slab->folio casts Mark a few more folio functions as taking a const folio pointer, which allows us to remove a few places in slab which cast away the const. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka --- mm/slab.h | 4 ++-- mm/slub.c | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 5f8f47c5bee0..b16e63191578 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -166,7 +166,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) */ static inline bool slab_test_pfmemalloc(const struct slab *slab) { - return folio_test_active((struct folio *)slab_folio(slab)); + return folio_test_active(slab_folio(slab)); } static inline void slab_set_pfmemalloc(struct slab *slab) @@ -211,7 +211,7 @@ static inline struct slab *virt_to_slab(const void *addr) static inline int slab_order(const struct slab *slab) { - return folio_order((struct folio *)slab_folio(slab)); + return folio_order(slab_folio(slab)); } static inline size_t slab_size(const struct slab *slab) diff --git a/mm/slub.c b/mm/slub.c index 95e0a3332c44..b8ba068ca079 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -962,11 +962,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_slab_info(const struct slab *slab) { - struct folio *folio = (struct folio *)slab_folio(slab); - pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - folio_flags(folio, 0)); + &slab->__page_flags); } /* @@ -2532,7 +2530,7 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) */ static inline bool slab_test_node_partial(const struct slab *slab) { - return folio_test_workingset((struct folio *)slab_folio(slab)); + return folio_test_workingset(slab_folio(slab)); } static inline void slab_set_node_partial(struct slab *slab) -- cgit v1.2.3 From 47d911b02cbe61494bb066ad84cc66d25091d506 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 7 Jun 2024 16:40:12 +0800 Subject: slab: make check_object() more consistent Now check_object() calls check_bytes_and_report() multiple times to check every section of the object it cares about, like the left and right redzones, object poison, padding poison and the freepointer. It will abort the checking process and return 0 once it finds an error. There are two inconsistencies in check_object(), which are alignment padding checking and object padding checking. We only print the error messages but don't return 0 to tell callers that something is wrong and needs to be handled. Please see alloc_debug_processing() and free_debug_processing() for details. We want to do all checks without skipping, so use a local variable "ret" to save each check result and change check_bytes_and_report() to only report specific error findings. Then, at the end of check_object(), print the trailer once if any check found an error.
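In outline, the reworked function has the following shape (a simplified sketch for illustration only; check_redzones(), check_poison() and check_freepointer() are placeholder names standing in for the actual check_bytes_and_report(), check_pad_bytes() and check_valid_pointer() calls in the diff below):

  static int check_object_outline(struct kmem_cache *s, struct slab *slab,
                                  void *object, u8 val)
  {
          int ret = 1;

          /* Each check only reports its own specific finding... */
          if (!check_redzones(s, slab, object, val))
                  ret = 0;
          if (!check_poison(s, slab, object, val))
                  ret = 0;
          if (!check_freepointer(s, slab, object, val))
                  ret = 0;

          /* ...and the trailer/taint are emitted once, at the end. */
          if (!ret && !slab_in_kunit_test()) {
                  print_trailer(s, slab, object);
                  add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
          }

          return ret;
  }
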
Suggested-by: Vlastimil Babka Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/slub.c | 62 +++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index b8ba068ca079..5df8c8302784 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -788,8 +788,24 @@ static bool slab_add_kunit_errors(void) kunit_put_resource(resource); return true; } + +static bool slab_in_kunit_test(void) +{ + struct kunit_resource *resource; + + if (!kunit_get_current_test()) + return false; + + resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); + if (!resource) + return false; + + kunit_put_resource(resource); + return true; +} #else static inline bool slab_add_kunit_errors(void) { return false; } +static inline bool slab_in_kunit_test(void) { return false; } #endif static inline unsigned int size_from_object(struct kmem_cache *s) @@ -1190,8 +1206,6 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1300,15 +1314,16 @@ static int check_object(struct kmem_cache *s, struct slab *slab, u8 *p = object; u8 *endobject = object + s->object_size; unsigned int orig_size, kasan_meta_size; + int ret = 1; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", object - s->red_left_pad, val, s->red_left_pad)) - return 0; + ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) - return 0; + ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { orig_size = get_orig_size(s, object); @@ -1317,14 +1332,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab, !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, val, s->object_size - orig_size)) { - return 0; + ret = 0; } } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { - check_bytes_and_report(s, slab, p, "Alignment padding", + if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size); + s->inuse - s->object_size)) + ret = 0; } } @@ -1340,27 +1356,25 @@ static int check_object(struct kmem_cache *s, struct slab *slab, !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, s->object_size - kasan_meta_size - 1)) - return 0; + ret = 0; if (kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End Poison", p + s->object_size - 1, POISON_END, 1)) - return 0; + ret = 0; } /* * check_pad_bytes cleans up on its own. */ - check_pad_bytes(s, slab, p); + if (!check_pad_bytes(s, slab, p)) + ret = 0; } - if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) - /* - * Object and freepointer overlap. Cannot check - * freepointer while object is allocated. - */ - return 1; - - /* Check free pointer validity */ - if (!check_valid_pointer(s, slab, get_freepointer(s, p))) { + /* + * Cannot check freepointer while object is allocated if + * object and freepointer overlap. 
+ */ + if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) && + !check_valid_pointer(s, slab, get_freepointer(s, p))) { object_err(s, slab, p, "Freepointer corrupt"); /* * No choice but to zap it and thus lose the remainder @@ -1368,9 +1382,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab, * another error because the object count is now wrong. */ set_freepointer(s, p, NULL); - return 0; + ret = 0; } - return 1; + + if (!ret && !slab_in_kunit_test()) { + print_trailer(s, slab, object); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + } + + return ret; } static int check_slab(struct kmem_cache *s, struct slab *slab) -- cgit v1.2.3 From adef2aeaa2b936c97865d56c59be2cb7266acbb7 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 7 Jun 2024 16:40:13 +0800 Subject: slab: don't put freepointer outside of object if only orig_size The commit 946fa0dbf2d8 ("mm/slub: extend redzone check to extra allocated kmalloc space than requested") will extend the right redzone when allocating for orig_size < object_size. So we can't overlay the freepointer in the object space in this case. But the code looks like it forgot to check SLAB_RED_ZONE, since there won't be an extended right redzone if only orig_size is enabled. While we are here, make these complex conditional expressions a little prettier and add some comments about extending the right redzone when slub_debug_orig_size() is enabled. Reviewed-by: Feng Tang Reviewed-by: Vlastimil Babka Signed-off-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5df8c8302784..5fc052a4f38f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5167,10 +5167,9 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if (slub_debug_orig_size(s) || - (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || - ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || - s->ctor) { + if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor || + ((flags & SLAB_RED_ZONE) && + (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on * kmem_cache_free. * * This is the case if we do RCU, have a constructor or * destructor, are poisoning the objects, or are - * redzoning an object smaller than sizeof(void *). + * redzoning an object smaller than sizeof(void *) or are + * redzoning an object with slub_debug_orig_size() enabled, + * in which case the right redzone may be extended. * * The assumption that s->offset >= s->inuse means free * pointer is outside of the object is used in the -- cgit v1.2.3 From 4a24bbabc826eaba6f94a9741712117b8e2b0587 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 7 Jun 2024 16:40:14 +0800 Subject: slab: delete useless RED_INACTIVE and RED_ACTIVE These seem useless since we use SLUB_RED_INACTIVE and SLUB_RED_ACTIVE instead, so just delete them; no functional change. Reviewed-by: Vlastimil Babka Signed-off-by: Chengming Zhou Reviewed-by: Christoph Lameter (Ampere) Signed-off-by: Vlastimil Babka --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5fc052a4f38f..3d19a0ee411f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1228,8 +1228,8 @@ skip_bug_print: * Padding is extended by another word if Redzoning is enabled and * object_size == inuse.
* - * We fill with 0xbb (RED_INACTIVE) for inactive objects and with - * 0xcc (RED_ACTIVE) for objects in use. + * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with + * 0xcc (SLUB_RED_ACTIVE) for objects in use. * * object + s->inuse * Meta data starts here. -- cgit v1.2.3 From ad59baa3169591e0b4cf1a217c9139f2145f4c7f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Jul 2024 09:25:21 +0200 Subject: slab, rust: extend kmalloc() alignment guarantees to remove Rust padding Slab allocators have been guaranteeing natural alignment for power-of-two sizes since commit 59bb47985c1d ("mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two)"), while any other sizes are guaranteed to be aligned only to ARCH_KMALLOC_MINALIGN bytes (although in practice they are aligned to more than that in non-debug scenarios). Rust's allocator API specifies size and alignment per allocation, which have to satisfy the following rules, per Alice Ryhl [1]: 1. The alignment is a power of two. 2. The size is non-zero. 3. When you round up the size to the next multiple of the alignment, then it must not overflow the signed type isize / ssize_t. In order to map this to kmalloc()'s guarantees, some requested allocation sizes have to be padded to the next power-of-two size [2]. For example, an allocation of size 96 and alignment of 32 will be padded to an allocation of size 128, because the existing kmalloc-96 bucket doesn't guarantee alignment above ARCH_KMALLOC_MINALIGN. Without slab debugging active, the layout of the kmalloc-96 slabs, however, naturally aligns the objects to 32 bytes, so extending the size to 128 bytes is wasteful. To improve the situation we can extend the kmalloc() alignment guarantees in a way that 1) doesn't change the current slab layout (and thus does not increase internal fragmentation) when slab debugging is not active 2) reduces waste in the Rust allocator use case 3) is a superset of the current guarantee for power-of-two sizes. The extended guarantee is that alignment is at least the largest power-of-two divisor of the requested size. For power-of-two sizes the largest divisor is the size itself, but let's keep this case documented separately for clarity. For current kmalloc size buckets, it means kmalloc-96 will guarantee alignment of 32 bytes and kmalloc-192 will guarantee 64 bytes. This covers rules 1 and 2 above of Rust's API as long as the size is a multiple of the alignment. The Rust layer should now only need to round up the size to the next multiple if it isn't, while enforcing rule 3. Implementation-wise, this changes the alignment calculation in create_boot_cache(). While at it, also do the calculation only for caches with the SLAB_KMALLOC flag, because the function is also used to create the initial kmem_cache and kmem_cache_node caches, where no alignment guarantee is necessary. In the Rust allocator's krealloc_aligned(), remove the code that padded sizes to the next power of two (suggested by Alice Ryhl) as it's no longer necessary with the new guarantees.
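To make the new rule concrete, here is a minimal userspace sketch (not kernel code) that computes the guaranteed alignment the same way the hunk below does in create_boot_cache(), namely as the largest power-of-two divisor of the requested size:

  #include <stdio.h>
  #include <strings.h>    /* ffs() */

  /* Largest power-of-two divisor of size == guaranteed kmalloc() alignment. */
  static unsigned int kmalloc_align_guarantee(unsigned int size)
  {
          return 1u << (ffs(size) - 1);
  }

  int main(void)
  {
          printf("96 -> %u, 192 -> %u, 128 -> %u\n",
                 kmalloc_align_guarantee(96),
                 kmalloc_align_guarantee(192),
                 kmalloc_align_guarantee(128));
          return 0;
  }

For 96 and 192 it prints 32 and 64, matching the kmalloc-96 and kmalloc-192 guarantees described above; for 128 it prints 128, i.e. the existing natural-alignment guarantee for power-of-two sizes.
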
Reported-by: Alice Ryhl Reported-by: Boqun Feng Link: https://lore.kernel.org/all/CAH5fLggjrbdUuT-H-5vbQfMazjRDpp2%2Bk3%3DYhPyS17ezEqxwcw@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/CAH5fLghsZRemYUwVvhk77o6y1foqnCeDzW4WZv6ScEWna2+_jw@mail.gmail.com/ [2] Reviewed-by: Boqun Feng Acked-by: Roman Gushchin Reviewed-by: Alice Ryhl Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 1560a1546bb1..7272ef7bc55f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -617,11 +617,12 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, s->size = s->object_size = size; /* - * For power of two sizes, guarantee natural alignment for kmalloc - * caches, regardless of SL*B debugging options. + * kmalloc caches guarantee alignment of at least the largest + * power-of-two divisor of the size. For power-of-two sizes, + * it is the size itself. */ - if (is_power_of_2(size)) - align = max(align, size); + if (flags & SLAB_KMALLOC) + align = max(align, 1U << (ffs(size) - 1)); s->align = calculate_alignment(flags, align, size); #ifdef CONFIG_HARDENED_USERCOPY -- cgit v1.2.3 From 72e0fe2241ce113cbba339ca8c2450b167774530 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:12:58 -0700 Subject: mm/slab: Introduce kmem_buckets typedef Encapsulate the concept of a single set of kmem_caches that are used for the kmalloc size buckets. Redefine kmalloc_caches as an array of these buckets (for the different global cache buckets). Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 7272ef7bc55f..ff60f91e4edc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -654,8 +654,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, return s; } -struct kmem_cache * -kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = +kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init = { /* initialization for https://llvm.org/pr42570 */ }; EXPORT_SYMBOL(kmalloc_caches); -- cgit v1.2.3 From 67f2df3b82d091ed095d0e47e1f3a9d3e18e4e41 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:12:59 -0700 Subject: mm/slab: Plumb kmem_buckets into __do_kmalloc_node() Introduce CONFIG_SLAB_BUCKETS which provides the infrastructure to support separated kmalloc buckets (in the following kmem_buckets_create() patches and future codetag-based separation). Since this will provide a mitigation for a very common case of exploits, it is recommended to enable this feature for general purpose distros. By default, the new Kconfig will be enabled if CONFIG_SLAB_FREELIST_HARDENED is enabled (and it is added to the hardening.config Kconfig fragment). To be able to choose which buckets to allocate from, make the buckets available to the internal kmalloc interfaces by adding them as the second argument, rather than depending on the buckets being chosen from the fixed set of global buckets. Where the bucket is not available, pass NULL, which means "use the default system kmalloc bucket set" (the prior existing behavior), as implemented in kmalloc_slab(). To avoid adding the extra argument when !CONFIG_SLAB_BUCKETS, only the top-level macros and static inlines use the buckets argument (where they are stripped out and compiled out respectively). 
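The kmem_buckets typedef and the bucket-parameter macros used below live in the slab headers, which are outside the mm/-only diffs shown here. The following is an illustrative sketch of the idea rather than the verbatim header (the typedef is implied by the kmalloc_caches redefinition above, and the #else branch is what makes the extra argument disappear when CONFIG_SLAB_BUCKETS=n):

  /* A bucket set is simply the per-size array of kmalloc caches. */
  typedef struct kmem_cache *kmem_buckets[KMALLOC_SHIFT_HIGH + 1];

  #ifdef CONFIG_SLAB_BUCKETS
  #define DECL_BUCKET_PARAMS(_size, _b)  size_t (_size), kmem_buckets *(_b)
  #define PASS_BUCKET_PARAMS(_size, _b)  (_size), (_b)
  #define PASS_BUCKET_PARAM(_b)          (_b)
  #else
  #define DECL_BUCKET_PARAMS(_size, _b)  size_t (_size)
  #define PASS_BUCKET_PARAMS(_size, _b)  (_size)
  #define PASS_BUCKET_PARAM(_b)          NULL
  #endif
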
The actual extern functions can then be built without the argument, and the internals fall back to the global kmalloc buckets unconditionally. Co-developed-by: Vlastimil Babka Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/Kconfig | 17 +++++++++++++++++ mm/slab.h | 6 ++++-- mm/slab_common.c | 2 +- mm/slub.c | 20 ++++++++++---------- 4 files changed, 32 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index b4cb45255a54..e0dfb268717c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -273,6 +273,23 @@ config SLAB_FREELIST_HARDENED sacrifices to harden the kernel slab allocator against common freelist exploit methods. +config SLAB_BUCKETS + bool "Support allocation from separate kmalloc buckets" + depends on !SLUB_TINY + default SLAB_FREELIST_HARDENED + help + Kernel heap attacks frequently depend on being able to create + specifically-sized allocations with user-controlled contents + that will be allocated into the same kmalloc bucket as a + target object. To avoid sharing these allocation buckets, + provide an explicitly separated set of buckets to be used for + user-controlled allocations. This may very slightly increase + memory fragmentation, though in practice it's only a handful + of extra pages since the bulk of user-controlled allocations + are relatively long-lived. + + If unsure, say Y. + config SLUB_STATS default n bool "Enable performance statistics" diff --git a/mm/slab.h b/mm/slab.h index b16e63191578..d5e8034af9d5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -403,16 +403,18 @@ static inline unsigned int size_index_elem(unsigned int bytes) * KMALLOC_MAX_CACHE_SIZE and the caller must check that. */ static inline struct kmem_cache * -kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) +kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) { unsigned int index; + if (!b) + b = &kmalloc_caches[kmalloc_type(flags, caller)]; if (size <= 192) index = kmalloc_size_index[size_index_elem(size)]; else index = fls(size - 1); - return kmalloc_caches[kmalloc_type(flags, caller)][index]; + return (*b)[index]; } gfp_t kmalloc_fix_flags(gfp_t flags); diff --git a/mm/slab_common.c b/mm/slab_common.c index ff60f91e4edc..bcc1e13d7f86 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -703,7 +703,7 @@ size_t kmalloc_size_roundup(size_t size) * The flags don't matter since size_index is common to all. * Neither does the caller for just getting ->object_size. */ - return kmalloc_slab(size, GFP_KERNEL, 0)->object_size; + return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size; } /* Above the smaller buckets, size is a multiple of page size. 
*/ diff --git a/mm/slub.c b/mm/slub.c index 3d19a0ee411f..80f0a51242d1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4117,7 +4117,7 @@ void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_large_node_noprof); static __always_inline -void *__do_kmalloc_node(size_t size, gfp_t flags, int node, +void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *s; @@ -4133,32 +4133,32 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, if (unlikely(!size)) return ZERO_SIZE_PTR; - s = kmalloc_slab(size, flags, caller); + s = kmalloc_slab(size, b, flags, caller); ret = slab_alloc_node(s, NULL, flags, node, caller, size); ret = kasan_kmalloc(s, ret, size, flags); trace_kmalloc(caller, ret, size, s->size, flags, node); return ret; } - -void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) +void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, _RET_IP_); + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_node_noprof); void *__kmalloc_noprof(size_t size, gfp_t flags) { - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); + return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_noprof); -void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, - int node, unsigned long caller) +void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, + int node, unsigned long caller) { - return __do_kmalloc_node(size, flags, node, caller); + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller); + } -EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); +EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof); void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { -- cgit v1.2.3 From 2e8000b826fcd2716449d09753d5ed843067881e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:13:00 -0700 Subject: mm/slab: Introduce kvmalloc_buckets_node() that can take kmem_buckets argument Plumb kmem_buckets arguments through kvmalloc_node_noprof() so it is possible to provide an API to perform kvmalloc-style allocations with a particular set of buckets. Introduce kvmalloc_buckets_node() that takes a kmem_buckets argument. Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/util.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index c9e519e6811f..28c5356b9f1c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -594,9 +594,10 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, EXPORT_SYMBOL(vm_mmap); /** - * kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. 
* @node: numa node to allocate from * @@ -609,7 +610,7 @@ EXPORT_SYMBOL(vm_mmap); * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; @@ -631,7 +632,7 @@ void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) kmalloc_flags &= ~__GFP_NOFAIL; } - ret = kmalloc_node_noprof(size, kmalloc_flags, node); + ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), kmalloc_flags, node); /* * It doesn't really make sense to fallback to vmalloc for sub page @@ -660,7 +661,7 @@ void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(kvmalloc_node_noprof); +EXPORT_SYMBOL(__kvmalloc_node_noprof); /** * kvfree() - Free memory. -- cgit v1.2.3 From b32801d1255be1da62ea8134df3ed9f3331fba12 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:13:01 -0700 Subject: mm/slab: Introduce kmem_buckets_create() and family Dedicated caches are available for fixed size allocations via kmem_cache_alloc(), but for dynamically sized allocations there is only the global kmalloc API's set of buckets available. This means it isn't possible to separate specific sets of dynamically sized allocations into a separate collection of caches. This leads to a use-after-free exploitation weakness in the Linux kernel since many heap memory spraying/grooming attacks depend on using userspace-controllable dynamically sized allocations to collide with fixed size allocations that end up in same cache. While CONFIG_RANDOM_KMALLOC_CACHES provides a probabilistic defense against these kinds of "type confusion" attacks, including for fixed same-size heap objects, we can create a complementary deterministic defense for dynamically sized allocations that are directly user controlled. Addressing these cases is limited in scope, so isolating these kinds of interfaces will not become an unbounded game of whack-a-mole. For example, many pass through memdup_user(), making isolation there very effective. In order to isolate user-controllable dynamically-sized allocations from the common system kmalloc allocations, introduce kmem_buckets_create(), which behaves like kmem_cache_create(). Introduce kmem_buckets_alloc(), which behaves like kmem_cache_alloc(). Introduce kmem_buckets_alloc_track_caller() for where caller tracking is needed. Introduce kmem_buckets_valloc() for cases where vmalloc fallback is needed. Note that these caches are specifically flagged with SLAB_NO_MERGE, since merging would defeat the entire purpose of the mitigation. This can also be used in the future to extend allocation profiling's use of code tagging to implement per-caller allocation cache isolation[1] even for dynamic allocations. Memory allocation pinning[2] is still needed to plug the Use-After-Free cross-allocator weakness (where attackers can arrange to free an entire slab page and have it reallocated to a different cache), but that is an existing and separate issue which is complementary to this improvement. Development continues for that feature via the SLAB_VIRTUAL[3] series (which could also provide guard pages -- another complementary improvement). 
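A usage sketch of the new family (the "foo" bucket set and its callers are hypothetical; the argument order follows the kmem_buckets_create() kernel-doc below and the memdup_user()/vmemdup_user() conversion at the end of this series, and kmem_buckets_alloc() is assumed to take (buckets, size, gfp) like its _track_caller and _valloc siblings):

  /* One-time setup, e.g. from an initcall of the subsystem (hypothetical). */
  static kmem_buckets *foo_buckets __ro_after_init;

  static int __init foo_buckets_init(void)
  {
          /*
           * NULL only means creating the bucket set failed; with
           * !CONFIG_SLAB_BUCKETS a non-NULL dummy (ZERO_SIZE_PTR) is
           * returned and the allocations below fall back to plain kmalloc().
           */
          foo_buckets = kmem_buckets_create("foo", 0, 0, INT_MAX, NULL);
          return 0;
  }
  subsys_initcall(foo_buckets_init);

  /* Allocate a userspace-controlled size from the isolated caches. */
  static void *foo_alloc(size_t len)
  {
          return kmem_buckets_alloc(foo_buckets, len, GFP_KERNEL);
  }
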
Link: https://lore.kernel.org/lkml/202402211449.401382D2AF@keescook [1] Link: https://googleprojectzero.blogspot.com/2021/10/how-simple-linux-kernel-memory.html [2] Link: https://lore.kernel.org/lkml/20230915105933.495735-1-matteorizzo@google.com/ [3] Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index bcc1e13d7f86..70943a4c1c4b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -392,6 +392,98 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, } EXPORT_SYMBOL(kmem_cache_create); +static struct kmem_cache *kmem_buckets_cache __ro_after_init; + +/** + * kmem_buckets_create - Create a set of caches that handle dynamic sized + * allocations via kmem_buckets_alloc() + * @name: A prefix string which is used in /proc/slabinfo to identify this + * cache. The individual caches with have their sizes as the suffix. + * @flags: SLAB flags (see kmem_cache_create() for details). + * @useroffset: Starting offset within an allocation that may be copied + * to/from userspace. + * @usersize: How many bytes, starting at @useroffset, may be copied + * to/from userspace. + * @ctor: A constructor for the objects, run when new allocations are made. + * + * Cannot be called within an interrupt, but can be interrupted. + * + * Return: a pointer to the cache on success, NULL on failure. When + * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and + * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc(). + * (i.e. callers only need to check for NULL on failure.) + */ +kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, + unsigned int useroffset, + unsigned int usersize, + void (*ctor)(void *)) +{ + kmem_buckets *b; + int idx; + + /* + * When the separate buckets API is not built in, just return + * a non-NULL value for the kmem_buckets pointer, which will be + * unused when performing allocations. 
+ */ + if (!IS_ENABLED(CONFIG_SLAB_BUCKETS)) + return ZERO_SIZE_PTR; + + if (WARN_ON(!kmem_buckets_cache)) + return NULL; + + b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO); + if (WARN_ON(!b)) + return NULL; + + flags |= SLAB_NO_MERGE; + + for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) { + char *short_size, *cache_name; + unsigned int cache_useroffset, cache_usersize; + unsigned int size; + + if (!kmalloc_caches[KMALLOC_NORMAL][idx]) + continue; + + size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size; + if (!size) + continue; + + short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-'); + if (WARN_ON(!short_size)) + goto fail; + + cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1); + if (WARN_ON(!cache_name)) + goto fail; + + if (useroffset >= size) { + cache_useroffset = 0; + cache_usersize = 0; + } else { + cache_useroffset = useroffset; + cache_usersize = min(size - cache_useroffset, usersize); + } + (*b)[idx] = kmem_cache_create_usercopy(cache_name, size, + 0, flags, cache_useroffset, + cache_usersize, ctor); + kfree(cache_name); + if (WARN_ON(!(*b)[idx])) + goto fail; + } + + return b; + +fail: + for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) + kmem_cache_destroy((*b)[idx]); + kfree(b); + + return NULL; +} +EXPORT_SYMBOL(kmem_buckets_create); + #ifdef SLAB_SUPPORTS_SYSFS /* * For a given kmem_cache, kmem_cache_destroy() should only be called @@ -932,6 +1024,11 @@ void __init create_kmalloc_caches(void) /* Kmalloc array is now usable */ slab_state = UP; + + if (IS_ENABLED(CONFIG_SLAB_BUCKETS)) + kmem_buckets_cache = kmem_cache_create("kmalloc_buckets", + sizeof(kmem_buckets), + 0, SLAB_NO_MERGE, NULL); } /** -- cgit v1.2.3 From d73778e4b86755d527a0c6b249cde846770b2f66 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:13:03 -0700 Subject: mm/util: Use dedicated slab buckets for memdup_user() Both memdup_user() and vmemdup_user() handle allocations that are regularly used for exploiting use-after-free type confusion flaws in the kernel (e.g. prctl() PR_SET_VMA_ANON_NAME[1] and setxattr[2][3][4] respectively). Since both are designed for contents coming from userspace, it allows for userspace-controlled allocation sizes. Use a dedicated set of kmalloc buckets so these allocations do not share caches with the global kmalloc buckets. After a fresh boot under Ubuntu 23.10, we can see the caches are already in active use: # grep ^memdup /proc/slabinfo memdup_user-8k 4 4 8192 4 8 : ... memdup_user-4k 8 8 4096 8 8 : ... memdup_user-2k 16 16 2048 16 8 : ... memdup_user-1k 0 0 1024 16 4 : ... memdup_user-512 0 0 512 16 2 : ... memdup_user-256 0 0 256 16 1 : ... memdup_user-128 0 0 128 32 1 : ... memdup_user-64 256 256 64 64 1 : ... memdup_user-32 512 512 32 128 1 : ... memdup_user-16 1024 1024 16 256 1 : ... memdup_user-8 2048 2048 8 512 1 : ... memdup_user-192 0 0 192 21 1 : ... memdup_user-96 168 168 96 42 1 : ... 
Link: https://starlabs.sg/blog/2023/07-prctl-anon_vma_name-an-amusing-heap-spray/ [1] Link: https://duasynt.com/blog/linux-kernel-heap-spray [2] Link: https://etenal.me/archives/1336 [3] Link: https://github.com/a13xp0p0v/kernel-hack-drill/blob/master/drill_exploit_uaf.c [4] Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/util.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 28c5356b9f1c..29189f48ee04 100644 --- a/mm/util.c +++ b/mm/util.c @@ -198,6 +198,16 @@ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup_nul); +static kmem_buckets *user_buckets __ro_after_init; + +static int __init init_user_buckets(void) +{ + user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL); + + return 0; +} +subsys_initcall(init_user_buckets); + /** * memdup_user - duplicate memory region from user space * @@ -211,7 +221,7 @@ void *memdup_user(const void __user *src, size_t len) { void *p; - p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); + p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); @@ -237,7 +247,7 @@ void *vmemdup_user(const void __user *src, size_t len) { void *p; - p = kvmalloc(len, GFP_USER); + p = kmem_buckets_valloc(user_buckets, len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); -- cgit v1.2.3