From a0a44d9175b349df2462089140fb7f292100bd7c Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 27 May 2024 11:01:28 +0200 Subject: mm, slab: don't wrap internal functions with alloc_hooks() The functions __kmalloc_noprof(), kmalloc_large_noprof(), kmalloc_trace_noprof() and their _node variants are all internal to the implementations of kmalloc_noprof() and kmalloc_node_noprof() and are only declared in the "public" slab.h and exported so that those implementations can be static inline and distinguish the build-time constant size variants. The only other users for some of the internal functions are slub_kunit and fortify_kunit tests which make very short-lived allocations. Therefore we can stop wrapping them with the alloc_hooks() macro. Instead add a __ prefix to all of them and a comment documenting these as internal. Also rename __kmalloc_trace() to __kmalloc_cache() which is more descriptive - it is a variant of __kmalloc() where the exact kmalloc cache has been already determined. The usage in fortify_kunit can be removed completely, as the internal functions should be tested already through kmalloc() tests in the test variant that passes non-constant allocation size. Reported-by: Kent Overstreet Cc: Suren Baghdasaryan Cc: Kees Cook Reviewed-by: Kent Overstreet Acked-by: David Rientjes Signed-off-by: Vlastimil Babka --- mm/slub.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 0809760cf789..95e0a3332c44 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4053,7 +4053,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); * directly to the page allocator. We use __GFP_COMP, because we will need to * know the allocation order to free the pages properly in kfree. 
*/ -static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) +static void *___kmalloc_large_node(size_t size, gfp_t flags, int node) { struct folio *folio; void *ptr = NULL; @@ -4078,25 +4078,25 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -void *kmalloc_large_noprof(size_t size, gfp_t flags) +void *__kmalloc_large_noprof(size_t size, gfp_t flags) { - void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + void *ret = ___kmalloc_large_node(size, flags, NUMA_NO_NODE); trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_large_noprof); +EXPORT_SYMBOL(__kmalloc_large_noprof); -void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) +void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) { - void *ret = __kmalloc_large_node(size, flags, node); + void *ret = ___kmalloc_large_node(size, flags, node); trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_large_node_noprof); +EXPORT_SYMBOL(__kmalloc_large_node_noprof); static __always_inline void *__do_kmalloc_node(size_t size, gfp_t flags, int node, @@ -4106,7 +4106,7 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, void *ret; if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { - ret = __kmalloc_large_node(size, flags, node); + ret = __kmalloc_large_node_noprof(size, flags, node); trace_kmalloc(caller, ret, size, PAGE_SIZE << get_order(size), flags, node); return ret; @@ -4142,7 +4142,7 @@ void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, } EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); -void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, NUMA_NO_NODE, _RET_IP_, size); @@ -4152,10 +4152,10 @@ void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_trace_noprof); +EXPORT_SYMBOL(__kmalloc_cache_noprof); -void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) +void *__kmalloc_cache_node_noprof(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, size); @@ -4164,7 +4164,7 @@ void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_node_trace_noprof); +EXPORT_SYMBOL(__kmalloc_cache_node_noprof); static noinline void free_to_partial_list( struct kmem_cache *s, struct slab *slab, -- cgit v1.2.3 From 4d2bcefa965b06a1f2be6912456bcfa86a34f184 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 31 May 2024 13:29:02 +0100 Subject: mm: Reduce the number of slab->folio casts Mark a few more folio functions as taking a const folio pointer, which allows us to remove a few places in slab which cast away the const. 
Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Vlastimil Babka --- mm/slab.h | 4 ++-- mm/slub.c | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index 5f8f47c5bee0..b16e63191578 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -166,7 +166,7 @@ static_assert(IS_ALIGNED(offsetof(struct slab, freelist), sizeof(freelist_aba_t) */ static inline bool slab_test_pfmemalloc(const struct slab *slab) { - return folio_test_active((struct folio *)slab_folio(slab)); + return folio_test_active(slab_folio(slab)); } static inline void slab_set_pfmemalloc(struct slab *slab) @@ -211,7 +211,7 @@ static inline struct slab *virt_to_slab(const void *addr) static inline int slab_order(const struct slab *slab) { - return folio_order((struct folio *)slab_folio(slab)); + return folio_order(slab_folio(slab)); } static inline size_t slab_size(const struct slab *slab) diff --git a/mm/slub.c b/mm/slub.c index 95e0a3332c44..b8ba068ca079 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -962,11 +962,9 @@ void print_tracking(struct kmem_cache *s, void *object) static void print_slab_info(const struct slab *slab) { - struct folio *folio = (struct folio *)slab_folio(slab); - pr_err("Slab 0x%p objects=%u used=%u fp=0x%p flags=%pGp\n", slab, slab->objects, slab->inuse, slab->freelist, - folio_flags(folio, 0)); + &slab->__page_flags); } /* @@ -2532,7 +2530,7 @@ static void discard_slab(struct kmem_cache *s, struct slab *slab) */ static inline bool slab_test_node_partial(const struct slab *slab) { - return folio_test_workingset((struct folio *)slab_folio(slab)); + return folio_test_workingset(slab_folio(slab)); } static inline void slab_set_node_partial(struct slab *slab) -- cgit v1.2.3 From 47d911b02cbe61494bb066ad84cc66d25091d506 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 7 Jun 2024 16:40:12 +0800 Subject: slab: make check_object() more consistent Now check_object() calls check_bytes_and_report() multiple times to check every section of the object it cares about, like the left and right redzones, object poison, padding poison and the freepointer. It will abort the checking process and return 0 once it finds an error. There are two inconsistencies in check_object(), which are alignment padding checking and object padding checking. We only print the error messages but don't return 0 to tell callers that something is wrong and needs to be handled. Please see alloc_debug_processing() and free_debug_processing() for details. We want to do all checks without skipping, so use a local variable "ret" to save each check result and change check_bytes_and_report() to only report specific error findings. Then, at the end of check_object(), print the trailer once if any check found an error.
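In outline, the reworked function has the following shape (a simplified sketch for illustration only; check_redzones(), check_poison() and check_freepointer() are placeholder names standing in for the actual check_bytes_and_report(), check_pad_bytes() and check_valid_pointer() calls in the diff below):

  static int check_object_outline(struct kmem_cache *s, struct slab *slab,
                                  void *object, u8 val)
  {
          int ret = 1;

          /* Each check only reports its own specific finding... */
          if (!check_redzones(s, slab, object, val))
                  ret = 0;
          if (!check_poison(s, slab, object, val))
                  ret = 0;
          if (!check_freepointer(s, slab, object, val))
                  ret = 0;

          /* ...and the trailer/taint are emitted once, at the end. */
          if (!ret && !slab_in_kunit_test()) {
                  print_trailer(s, slab, object);
                  add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
          }

          return ret;
  }
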
Suggested-by: Vlastimil Babka Signed-off-by: Chengming Zhou Reviewed-by: Vlastimil Babka Signed-off-by: Vlastimil Babka --- mm/slub.c | 62 +++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index b8ba068ca079..5df8c8302784 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -788,8 +788,24 @@ static bool slab_add_kunit_errors(void) kunit_put_resource(resource); return true; } + +static bool slab_in_kunit_test(void) +{ + struct kunit_resource *resource; + + if (!kunit_get_current_test()) + return false; + + resource = kunit_find_named_resource(current->kunit_test, "slab_errors"); + if (!resource) + return false; + + kunit_put_resource(resource); + return true; +} #else static inline bool slab_add_kunit_errors(void) { return false; } +static inline bool slab_in_kunit_test(void) { return false; } #endif static inline unsigned int size_from_object(struct kmem_cache *s) @@ -1190,8 +1206,6 @@ static int check_bytes_and_report(struct kmem_cache *s, struct slab *slab, pr_err("0x%p-0x%p @offset=%tu. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault - addr, fault[0], value); - print_trailer(s, slab, object); - add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); skip_bug_print: restore_bytes(s, what, value, fault, end); @@ -1300,15 +1314,16 @@ static int check_object(struct kmem_cache *s, struct slab *slab, u8 *p = object; u8 *endobject = object + s->object_size; unsigned int orig_size, kasan_meta_size; + int ret = 1; if (s->flags & SLAB_RED_ZONE) { if (!check_bytes_and_report(s, slab, object, "Left Redzone", object - s->red_left_pad, val, s->red_left_pad)) - return 0; + ret = 0; if (!check_bytes_and_report(s, slab, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) - return 0; + ret = 0; if (slub_debug_orig_size(s) && val == SLUB_RED_ACTIVE) { orig_size = get_orig_size(s, object); @@ -1317,14 +1332,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab, !check_bytes_and_report(s, slab, object, "kmalloc Redzone", p + orig_size, val, s->object_size - orig_size)) { - return 0; + ret = 0; } } } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { - check_bytes_and_report(s, slab, p, "Alignment padding", + if (!check_bytes_and_report(s, slab, p, "Alignment padding", endobject, POISON_INUSE, - s->inuse - s->object_size); + s->inuse - s->object_size)) + ret = 0; } } @@ -1340,27 +1356,25 @@ static int check_object(struct kmem_cache *s, struct slab *slab, !check_bytes_and_report(s, slab, p, "Poison", p + kasan_meta_size, POISON_FREE, s->object_size - kasan_meta_size - 1)) - return 0; + ret = 0; if (kasan_meta_size < s->object_size && !check_bytes_and_report(s, slab, p, "End Poison", p + s->object_size - 1, POISON_END, 1)) - return 0; + ret = 0; } /* * check_pad_bytes cleans up on its own. */ - check_pad_bytes(s, slab, p); + if (!check_pad_bytes(s, slab, p)) + ret = 0; } - if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE) - /* - * Object and freepointer overlap. Cannot check - * freepointer while object is allocated. - */ - return 1; - - /* Check free pointer validity */ - if (!check_valid_pointer(s, slab, get_freepointer(s, p))) { + /* + * Cannot check freepointer while object is allocated if + * object and freepointer overlap. 
+ */ + if ((freeptr_outside_object(s) || val != SLUB_RED_ACTIVE) && + !check_valid_pointer(s, slab, get_freepointer(s, p))) { object_err(s, slab, p, "Freepointer corrupt"); /* * No choice but to zap it and thus lose the remainder @@ -1368,9 +1382,15 @@ static int check_object(struct kmem_cache *s, struct slab *slab, * another error because the object count is now wrong. */ set_freepointer(s, p, NULL); - return 0; + ret = 0; } - return 1; + + if (!ret && !slab_in_kunit_test()) { + print_trailer(s, slab, object); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + } + + return ret; } static int check_slab(struct kmem_cache *s, struct slab *slab) -- cgit v1.2.3 From adef2aeaa2b936c97865d56c59be2cb7266acbb7 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 7 Jun 2024 16:40:13 +0800 Subject: slab: don't put freepointer outside of object if only orig_size The commit 946fa0dbf2d8 ("mm/slub: extend redzone check to extra allocated kmalloc space than requested") will extend the right redzone when allocating for orig_size < object_size. So we can't overlay the freepointer in the object space in this case. But the code looks like it forgot to check SLAB_RED_ZONE, since there won't be an extended right redzone if only orig_size is enabled. While we are here, make these complex conditional expressions a little prettier and add some comments about extending the right redzone when slub_debug_orig_size() is enabled. Reviewed-by: Feng Tang Reviewed-by: Vlastimil Babka Signed-off-by: Chengming Zhou Signed-off-by: Vlastimil Babka --- mm/slub.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5df8c8302784..5fc052a4f38f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5167,10 +5167,9 @@ static int calculate_sizes(struct kmem_cache *s) */ s->inuse = size; - if (slub_debug_orig_size(s) || - (flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || - ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || - s->ctor) { + if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor || + ((flags & SLAB_RED_ZONE) && + (s->object_size < sizeof(void *) || slub_debug_orig_size(s)))) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on * kmem_cache_free. * * This is the case if we do RCU, have a constructor or * destructor, are poisoning the objects, or are - * redzoning an object smaller than sizeof(void *). + * redzoning an object smaller than sizeof(void *) or are + * redzoning an object with slub_debug_orig_size() enabled, + * in which case the right redzone may be extended. * * The assumption that s->offset >= s->inuse means free * pointer is outside of the object is used in the -- cgit v1.2.3 From 4a24bbabc826eaba6f94a9741712117b8e2b0587 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Fri, 7 Jun 2024 16:40:14 +0800 Subject: slab: delete useless RED_INACTIVE and RED_ACTIVE These seem useless since we use SLUB_RED_INACTIVE and SLUB_RED_ACTIVE instead, so just delete them; no functional change. Reviewed-by: Vlastimil Babka Signed-off-by: Chengming Zhou Reviewed-by: Christoph Lameter (Ampere) Signed-off-by: Vlastimil Babka --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 5fc052a4f38f..3d19a0ee411f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1228,8 +1228,8 @@ skip_bug_print: * Padding is extended by another word if Redzoning is enabled and * object_size == inuse.
* - * We fill with 0xbb (RED_INACTIVE) for inactive objects and with - * 0xcc (RED_ACTIVE) for objects in use. + * We fill with 0xbb (SLUB_RED_INACTIVE) for inactive objects and with + * 0xcc (SLUB_RED_ACTIVE) for objects in use. * * object + s->inuse * Meta data starts here. -- cgit v1.2.3 From ad59baa3169591e0b4cf1a217c9139f2145f4c7f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 3 Jul 2024 09:25:21 +0200 Subject: slab, rust: extend kmalloc() alignment guarantees to remove Rust padding Slab allocators have been guaranteeing natural alignment for power-of-two sizes since commit 59bb47985c1d ("mm, sl[aou]b: guarantee natural alignment for kmalloc(power-of-two)"), while any other sizes are guaranteed to be aligned only to ARCH_KMALLOC_MINALIGN bytes (although in practice they are aligned to more than that in non-debug scenarios). Rust's allocator API specifies size and alignment per allocation, which have to satisfy the following rules, per Alice Ryhl [1]: 1. The alignment is a power of two. 2. The size is non-zero. 3. When you round up the size to the next multiple of the alignment, then it must not overflow the signed type isize / ssize_t. In order to map this to kmalloc()'s guarantees, some requested allocation sizes have to be padded to the next power-of-two size [2]. For example, an allocation of size 96 and alignment of 32 will be padded to an allocation of size 128, because the existing kmalloc-96 bucket doesn't guarantee alignment above ARCH_KMALLOC_MINALIGN. Without slab debugging active, the layout of the kmalloc-96 slabs, however, naturally aligns the objects to 32 bytes, so extending the size to 128 bytes is wasteful. To improve the situation we can extend the kmalloc() alignment guarantees in a way that 1) doesn't change the current slab layout (and thus does not increase internal fragmentation) when slab debugging is not active 2) reduces waste in the Rust allocator use case 3) is a superset of the current guarantee for power-of-two sizes. The extended guarantee is that alignment is at least the largest power-of-two divisor of the requested size. For power-of-two sizes the largest divisor is the size itself, but let's keep this case documented separately for clarity. For current kmalloc size buckets, it means kmalloc-96 will guarantee alignment of 32 bytes and kmalloc-192 will guarantee 64 bytes. This covers rules 1 and 2 above of Rust's API as long as the size is a multiple of the alignment. The Rust layer should now only need to round up the size to the next multiple if it isn't, while enforcing rule 3. Implementation-wise, this changes the alignment calculation in create_boot_cache(). While at it, also do the calculation only for caches with the SLAB_KMALLOC flag, because the function is also used to create the initial kmem_cache and kmem_cache_node caches, where no alignment guarantee is necessary. In the Rust allocator's krealloc_aligned(), remove the code that padded sizes to the next power of two (suggested by Alice Ryhl) as it's no longer necessary with the new guarantees.
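To make the new rule concrete, here is a minimal userspace sketch (not kernel code) that computes the guaranteed alignment the same way the hunk below does in create_boot_cache(), namely as the largest power-of-two divisor of the requested size:

  #include <stdio.h>
  #include <strings.h>    /* ffs() */

  /* Largest power-of-two divisor of size == guaranteed kmalloc() alignment. */
  static unsigned int kmalloc_align_guarantee(unsigned int size)
  {
          return 1u << (ffs(size) - 1);
  }

  int main(void)
  {
          printf("96 -> %u, 192 -> %u, 128 -> %u\n",
                 kmalloc_align_guarantee(96),
                 kmalloc_align_guarantee(192),
                 kmalloc_align_guarantee(128));
          return 0;
  }

For 96 and 192 it prints 32 and 64, matching the kmalloc-96 and kmalloc-192 guarantees described above; for 128 it prints 128, i.e. the existing natural-alignment guarantee for power-of-two sizes.
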
Reported-by: Alice Ryhl Reported-by: Boqun Feng Link: https://lore.kernel.org/all/CAH5fLggjrbdUuT-H-5vbQfMazjRDpp2%2Bk3%3DYhPyS17ezEqxwcw@mail.gmail.com/ [1] Link: https://lore.kernel.org/all/CAH5fLghsZRemYUwVvhk77o6y1foqnCeDzW4WZv6ScEWna2+_jw@mail.gmail.com/ [2] Reviewed-by: Boqun Feng Acked-by: Roman Gushchin Reviewed-by: Alice Ryhl Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 1560a1546bb1..7272ef7bc55f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -617,11 +617,12 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, s->size = s->object_size = size; /* - * For power of two sizes, guarantee natural alignment for kmalloc - * caches, regardless of SL*B debugging options. + * kmalloc caches guarantee alignment of at least the largest + * power-of-two divisor of the size. For power-of-two sizes, + * it is the size itself. */ - if (is_power_of_2(size)) - align = max(align, size); + if (flags & SLAB_KMALLOC) + align = max(align, 1U << (ffs(size) - 1)); s->align = calculate_alignment(flags, align, size); #ifdef CONFIG_HARDENED_USERCOPY -- cgit v1.2.3 From 72e0fe2241ce113cbba339ca8c2450b167774530 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:12:58 -0700 Subject: mm/slab: Introduce kmem_buckets typedef Encapsulate the concept of a single set of kmem_caches that are used for the kmalloc size buckets. Redefine kmalloc_caches as an array of these buckets (for the different global cache buckets). Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index 7272ef7bc55f..ff60f91e4edc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -654,8 +654,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, return s; } -struct kmem_cache * -kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init = +kmem_buckets kmalloc_caches[NR_KMALLOC_TYPES] __ro_after_init = { /* initialization for https://llvm.org/pr42570 */ }; EXPORT_SYMBOL(kmalloc_caches); -- cgit v1.2.3 From 67f2df3b82d091ed095d0e47e1f3a9d3e18e4e41 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:12:59 -0700 Subject: mm/slab: Plumb kmem_buckets into __do_kmalloc_node() Introduce CONFIG_SLAB_BUCKETS which provides the infrastructure to support separated kmalloc buckets (in the following kmem_buckets_create() patches and future codetag-based separation). Since this will provide a mitigation for a very common case of exploits, it is recommended to enable this feature for general purpose distros. By default, the new Kconfig will be enabled if CONFIG_SLAB_FREELIST_HARDENED is enabled (and it is added to the hardening.config Kconfig fragment). To be able to choose which buckets to allocate from, make the buckets available to the internal kmalloc interfaces by adding them as the second argument, rather than depending on the buckets being chosen from the fixed set of global buckets. Where the bucket is not available, pass NULL, which means "use the default system kmalloc bucket set" (the prior existing behavior), as implemented in kmalloc_slab(). To avoid adding the extra argument when !CONFIG_SLAB_BUCKETS, only the top-level macros and static inlines use the buckets argument (where they are stripped out and compiled out respectively). 
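The kmem_buckets typedef and the bucket-parameter macros used below live in the slab headers, which are outside the mm/-only diffs shown here. The following is an illustrative sketch of the idea rather than the verbatim header (the typedef is implied by the kmalloc_caches redefinition above, and the #else branch is what makes the extra argument disappear when CONFIG_SLAB_BUCKETS=n):

  /* A bucket set is simply the per-size array of kmalloc caches. */
  typedef struct kmem_cache *kmem_buckets[KMALLOC_SHIFT_HIGH + 1];

  #ifdef CONFIG_SLAB_BUCKETS
  #define DECL_BUCKET_PARAMS(_size, _b)  size_t (_size), kmem_buckets *(_b)
  #define PASS_BUCKET_PARAMS(_size, _b)  (_size), (_b)
  #define PASS_BUCKET_PARAM(_b)          (_b)
  #else
  #define DECL_BUCKET_PARAMS(_size, _b)  size_t (_size)
  #define PASS_BUCKET_PARAMS(_size, _b)  (_size)
  #define PASS_BUCKET_PARAM(_b)          NULL
  #endif
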
The actual extern functions can then be built without the argument, and the internals fall back to the global kmalloc buckets unconditionally. Co-developed-by: Vlastimil Babka Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/Kconfig | 17 +++++++++++++++++ mm/slab.h | 6 ++++-- mm/slab_common.c | 2 +- mm/slub.c | 20 ++++++++++---------- 4 files changed, 32 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index b4cb45255a54..e0dfb268717c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -273,6 +273,23 @@ config SLAB_FREELIST_HARDENED sacrifices to harden the kernel slab allocator against common freelist exploit methods. +config SLAB_BUCKETS + bool "Support allocation from separate kmalloc buckets" + depends on !SLUB_TINY + default SLAB_FREELIST_HARDENED + help + Kernel heap attacks frequently depend on being able to create + specifically-sized allocations with user-controlled contents + that will be allocated into the same kmalloc bucket as a + target object. To avoid sharing these allocation buckets, + provide an explicitly separated set of buckets to be used for + user-controlled allocations. This may very slightly increase + memory fragmentation, though in practice it's only a handful + of extra pages since the bulk of user-controlled allocations + are relatively long-lived. + + If unsure, say Y. + config SLUB_STATS default n bool "Enable performance statistics" diff --git a/mm/slab.h b/mm/slab.h index b16e63191578..d5e8034af9d5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -403,16 +403,18 @@ static inline unsigned int size_index_elem(unsigned int bytes) * KMALLOC_MAX_CACHE_SIZE and the caller must check that. */ static inline struct kmem_cache * -kmalloc_slab(size_t size, gfp_t flags, unsigned long caller) +kmalloc_slab(size_t size, kmem_buckets *b, gfp_t flags, unsigned long caller) { unsigned int index; + if (!b) + b = &kmalloc_caches[kmalloc_type(flags, caller)]; if (size <= 192) index = kmalloc_size_index[size_index_elem(size)]; else index = fls(size - 1); - return kmalloc_caches[kmalloc_type(flags, caller)][index]; + return (*b)[index]; } gfp_t kmalloc_fix_flags(gfp_t flags); diff --git a/mm/slab_common.c b/mm/slab_common.c index ff60f91e4edc..bcc1e13d7f86 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -703,7 +703,7 @@ size_t kmalloc_size_roundup(size_t size) * The flags don't matter since size_index is common to all. * Neither does the caller for just getting ->object_size. */ - return kmalloc_slab(size, GFP_KERNEL, 0)->object_size; + return kmalloc_slab(size, NULL, GFP_KERNEL, 0)->object_size; } /* Above the smaller buckets, size is a multiple of page size. 
*/ diff --git a/mm/slub.c b/mm/slub.c index 3d19a0ee411f..80f0a51242d1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4117,7 +4117,7 @@ void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) EXPORT_SYMBOL(__kmalloc_large_node_noprof); static __always_inline -void *__do_kmalloc_node(size_t size, gfp_t flags, int node, +void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *s; @@ -4133,32 +4133,32 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, if (unlikely(!size)) return ZERO_SIZE_PTR; - s = kmalloc_slab(size, flags, caller); + s = kmalloc_slab(size, b, flags, caller); ret = slab_alloc_node(s, NULL, flags, node, caller, size); ret = kasan_kmalloc(s, ret, size, flags); trace_kmalloc(caller, ret, size, s->size, flags, node); return ret; } - -void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) +void *__kmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { - return __do_kmalloc_node(size, flags, node, _RET_IP_); + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_node_noprof); void *__kmalloc_noprof(size_t size, gfp_t flags) { - return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); + return __do_kmalloc_node(size, NULL, flags, NUMA_NO_NODE, _RET_IP_); } EXPORT_SYMBOL(__kmalloc_noprof); -void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, - int node, unsigned long caller) +void *__kmalloc_node_track_caller_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, + int node, unsigned long caller) { - return __do_kmalloc_node(size, flags, node, caller); + return __do_kmalloc_node(size, PASS_BUCKET_PARAM(b), flags, node, caller); + } -EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); +EXPORT_SYMBOL(__kmalloc_node_track_caller_noprof); void *__kmalloc_cache_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { -- cgit v1.2.3 From 2e8000b826fcd2716449d09753d5ed843067881e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:13:00 -0700 Subject: mm/slab: Introduce kvmalloc_buckets_node() that can take kmem_buckets argument Plumb kmem_buckets arguments through kvmalloc_node_noprof() so it is possible to provide an API to perform kvmalloc-style allocations with a particular set of buckets. Introduce kvmalloc_buckets_node() that takes a kmem_buckets argument. Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/util.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index c9e519e6811f..28c5356b9f1c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -594,9 +594,10 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, EXPORT_SYMBOL(vm_mmap); /** - * kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. + * @b: which set of kmalloc buckets to allocate from. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. 
* @node: numa node to allocate from * @@ -609,7 +610,7 @@ EXPORT_SYMBOL(vm_mmap); * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) +void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; @@ -631,7 +632,7 @@ void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) kmalloc_flags &= ~__GFP_NOFAIL; } - ret = kmalloc_node_noprof(size, kmalloc_flags, node); + ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), kmalloc_flags, node); /* * It doesn't really make sense to fallback to vmalloc for sub page @@ -660,7 +661,7 @@ void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(kvmalloc_node_noprof); +EXPORT_SYMBOL(__kvmalloc_node_noprof); /** * kvfree() - Free memory. -- cgit v1.2.3 From b32801d1255be1da62ea8134df3ed9f3331fba12 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:13:01 -0700 Subject: mm/slab: Introduce kmem_buckets_create() and family Dedicated caches are available for fixed size allocations via kmem_cache_alloc(), but for dynamically sized allocations there is only the global kmalloc API's set of buckets available. This means it isn't possible to separate specific sets of dynamically sized allocations into a separate collection of caches. This leads to a use-after-free exploitation weakness in the Linux kernel since many heap memory spraying/grooming attacks depend on using userspace-controllable dynamically sized allocations to collide with fixed size allocations that end up in same cache. While CONFIG_RANDOM_KMALLOC_CACHES provides a probabilistic defense against these kinds of "type confusion" attacks, including for fixed same-size heap objects, we can create a complementary deterministic defense for dynamically sized allocations that are directly user controlled. Addressing these cases is limited in scope, so isolating these kinds of interfaces will not become an unbounded game of whack-a-mole. For example, many pass through memdup_user(), making isolation there very effective. In order to isolate user-controllable dynamically-sized allocations from the common system kmalloc allocations, introduce kmem_buckets_create(), which behaves like kmem_cache_create(). Introduce kmem_buckets_alloc(), which behaves like kmem_cache_alloc(). Introduce kmem_buckets_alloc_track_caller() for where caller tracking is needed. Introduce kmem_buckets_valloc() for cases where vmalloc fallback is needed. Note that these caches are specifically flagged with SLAB_NO_MERGE, since merging would defeat the entire purpose of the mitigation. This can also be used in the future to extend allocation profiling's use of code tagging to implement per-caller allocation cache isolation[1] even for dynamic allocations. Memory allocation pinning[2] is still needed to plug the Use-After-Free cross-allocator weakness (where attackers can arrange to free an entire slab page and have it reallocated to a different cache), but that is an existing and separate issue which is complementary to this improvement. Development continues for that feature via the SLAB_VIRTUAL[3] series (which could also provide guard pages -- another complementary improvement). 
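A usage sketch of the new family (the "foo" bucket set and its callers are hypothetical; the argument order follows the kmem_buckets_create() kernel-doc below and the memdup_user()/vmemdup_user() conversion at the end of this series, and kmem_buckets_alloc() is assumed to take (buckets, size, gfp) like its _track_caller and _valloc siblings):

  /* One-time setup, e.g. from an initcall of the subsystem (hypothetical). */
  static kmem_buckets *foo_buckets __ro_after_init;

  static int __init foo_buckets_init(void)
  {
          /*
           * NULL only means creating the bucket set failed; with
           * !CONFIG_SLAB_BUCKETS a non-NULL dummy (ZERO_SIZE_PTR) is
           * returned and the allocations below fall back to plain kmalloc().
           */
          foo_buckets = kmem_buckets_create("foo", 0, 0, INT_MAX, NULL);
          return 0;
  }
  subsys_initcall(foo_buckets_init);

  /* Allocate a userspace-controlled size from the isolated caches. */
  static void *foo_alloc(size_t len)
  {
          return kmem_buckets_alloc(foo_buckets, len, GFP_KERNEL);
  }
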
Link: https://lore.kernel.org/lkml/202402211449.401382D2AF@keescook [1] Link: https://googleprojectzero.blogspot.com/2021/10/how-simple-linux-kernel-memory.html [2] Link: https://lore.kernel.org/lkml/20230915105933.495735-1-matteorizzo@google.com/ [3] Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/slab_common.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) (limited to 'mm') diff --git a/mm/slab_common.c b/mm/slab_common.c index bcc1e13d7f86..70943a4c1c4b 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -392,6 +392,98 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, } EXPORT_SYMBOL(kmem_cache_create); +static struct kmem_cache *kmem_buckets_cache __ro_after_init; + +/** + * kmem_buckets_create - Create a set of caches that handle dynamic sized + * allocations via kmem_buckets_alloc() + * @name: A prefix string which is used in /proc/slabinfo to identify this + * cache. The individual caches with have their sizes as the suffix. + * @flags: SLAB flags (see kmem_cache_create() for details). + * @useroffset: Starting offset within an allocation that may be copied + * to/from userspace. + * @usersize: How many bytes, starting at @useroffset, may be copied + * to/from userspace. + * @ctor: A constructor for the objects, run when new allocations are made. + * + * Cannot be called within an interrupt, but can be interrupted. + * + * Return: a pointer to the cache on success, NULL on failure. When + * CONFIG_SLAB_BUCKETS is not enabled, ZERO_SIZE_PTR is returned, and + * subsequent calls to kmem_buckets_alloc() will fall back to kmalloc(). + * (i.e. callers only need to check for NULL on failure.) + */ +kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags, + unsigned int useroffset, + unsigned int usersize, + void (*ctor)(void *)) +{ + kmem_buckets *b; + int idx; + + /* + * When the separate buckets API is not built in, just return + * a non-NULL value for the kmem_buckets pointer, which will be + * unused when performing allocations. 
+ */ + if (!IS_ENABLED(CONFIG_SLAB_BUCKETS)) + return ZERO_SIZE_PTR; + + if (WARN_ON(!kmem_buckets_cache)) + return NULL; + + b = kmem_cache_alloc(kmem_buckets_cache, GFP_KERNEL|__GFP_ZERO); + if (WARN_ON(!b)) + return NULL; + + flags |= SLAB_NO_MERGE; + + for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) { + char *short_size, *cache_name; + unsigned int cache_useroffset, cache_usersize; + unsigned int size; + + if (!kmalloc_caches[KMALLOC_NORMAL][idx]) + continue; + + size = kmalloc_caches[KMALLOC_NORMAL][idx]->object_size; + if (!size) + continue; + + short_size = strchr(kmalloc_caches[KMALLOC_NORMAL][idx]->name, '-'); + if (WARN_ON(!short_size)) + goto fail; + + cache_name = kasprintf(GFP_KERNEL, "%s-%s", name, short_size + 1); + if (WARN_ON(!cache_name)) + goto fail; + + if (useroffset >= size) { + cache_useroffset = 0; + cache_usersize = 0; + } else { + cache_useroffset = useroffset; + cache_usersize = min(size - cache_useroffset, usersize); + } + (*b)[idx] = kmem_cache_create_usercopy(cache_name, size, + 0, flags, cache_useroffset, + cache_usersize, ctor); + kfree(cache_name); + if (WARN_ON(!(*b)[idx])) + goto fail; + } + + return b; + +fail: + for (idx = 0; idx < ARRAY_SIZE(kmalloc_caches[KMALLOC_NORMAL]); idx++) + kmem_cache_destroy((*b)[idx]); + kfree(b); + + return NULL; +} +EXPORT_SYMBOL(kmem_buckets_create); + #ifdef SLAB_SUPPORTS_SYSFS /* * For a given kmem_cache, kmem_cache_destroy() should only be called @@ -932,6 +1024,11 @@ void __init create_kmalloc_caches(void) /* Kmalloc array is now usable */ slab_state = UP; + + if (IS_ENABLED(CONFIG_SLAB_BUCKETS)) + kmem_buckets_cache = kmem_cache_create("kmalloc_buckets", + sizeof(kmem_buckets), + 0, SLAB_NO_MERGE, NULL); } /** -- cgit v1.2.3 From d73778e4b86755d527a0c6b249cde846770b2f66 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 1 Jul 2024 12:13:03 -0700 Subject: mm/util: Use dedicated slab buckets for memdup_user() Both memdup_user() and vmemdup_user() handle allocations that are regularly used for exploiting use-after-free type confusion flaws in the kernel (e.g. prctl() PR_SET_VMA_ANON_NAME[1] and setxattr[2][3][4] respectively). Since both are designed for contents coming from userspace, it allows for userspace-controlled allocation sizes. Use a dedicated set of kmalloc buckets so these allocations do not share caches with the global kmalloc buckets. After a fresh boot under Ubuntu 23.10, we can see the caches are already in active use: # grep ^memdup /proc/slabinfo memdup_user-8k 4 4 8192 4 8 : ... memdup_user-4k 8 8 4096 8 8 : ... memdup_user-2k 16 16 2048 16 8 : ... memdup_user-1k 0 0 1024 16 4 : ... memdup_user-512 0 0 512 16 2 : ... memdup_user-256 0 0 256 16 1 : ... memdup_user-128 0 0 128 32 1 : ... memdup_user-64 256 256 64 64 1 : ... memdup_user-32 512 512 32 128 1 : ... memdup_user-16 1024 1024 16 256 1 : ... memdup_user-8 2048 2048 8 512 1 : ... memdup_user-192 0 0 192 21 1 : ... memdup_user-96 168 168 96 42 1 : ... 
Link: https://starlabs.sg/blog/2023/07-prctl-anon_vma_name-an-amusing-heap-spray/ [1] Link: https://duasynt.com/blog/linux-kernel-heap-spray [2] Link: https://etenal.me/archives/1336 [3] Link: https://github.com/a13xp0p0v/kernel-hack-drill/blob/master/drill_exploit_uaf.c [4] Signed-off-by: Kees Cook Signed-off-by: Vlastimil Babka --- mm/util.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/util.c b/mm/util.c index 28c5356b9f1c..29189f48ee04 100644 --- a/mm/util.c +++ b/mm/util.c @@ -198,6 +198,16 @@ char *kmemdup_nul(const char *s, size_t len, gfp_t gfp) } EXPORT_SYMBOL(kmemdup_nul); +static kmem_buckets *user_buckets __ro_after_init; + +static int __init init_user_buckets(void) +{ + user_buckets = kmem_buckets_create("memdup_user", 0, 0, INT_MAX, NULL); + + return 0; +} +subsys_initcall(init_user_buckets); + /** * memdup_user - duplicate memory region from user space * @@ -211,7 +221,7 @@ void *memdup_user(const void __user *src, size_t len) { void *p; - p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN); + p = kmem_buckets_alloc_track_caller(user_buckets, len, GFP_USER | __GFP_NOWARN); if (!p) return ERR_PTR(-ENOMEM); @@ -237,7 +247,7 @@ void *vmemdup_user(const void __user *src, size_t len) { void *p; - p = kvmalloc(len, GFP_USER); + p = kmem_buckets_valloc(user_buckets, len, GFP_USER); if (!p) return ERR_PTR(-ENOMEM); -- cgit v1.2.3