From 543585cc5b07fa99a2dc897159fbf48c1eb73058 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 18 Oct 2011 22:09:24 -0700
Subject: slab: rename slab_break_gfp_order to slab_max_order

slab_break_gfp_order is more appropriately named slab_max_order since it
enforces the maximum order size of slabs as long as a single object will
still fit.

Also rename BREAK_GFP_ORDER_{LO,HI} accordingly.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 708efe886154..1a482e8402c4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -481,9 +481,9 @@ EXPORT_SYMBOL(slab_buffer_size);
 /*
  * Do not go above this order unless 0 objects fit into the slab.
  */
-#define BREAK_GFP_ORDER_HI 1
-#define BREAK_GFP_ORDER_LO 0
-static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
+#define SLAB_MAX_ORDER_HI 1
+#define SLAB_MAX_ORDER_LO 0
+static int slab_max_order = SLAB_MAX_ORDER_LO;
 
 /*
  * Functions for storing/retrieving the cachep and or slab from the page
@@ -1502,7 +1502,7 @@ void __init kmem_cache_init(void)
 	 * page orders on machines with more than 32MB of memory.
 	 */
 	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
-		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
+		slab_max_order = SLAB_MAX_ORDER_HI;
 
 	/* Bootstrap is tricky, because several objects are allocated
 	 * from caches that do not exist yet:
@@ -2112,7 +2112,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 		 * Large number of objects is good, but very large slabs are
 		 * currently bad for the gfp()s.
 		 */
-		if (gfporder >= slab_break_gfp_order)
+		if (gfporder >= slab_max_order)
			break;
 
 		/*
--
cgit v1.2.3
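For readers without mm/slab.c at hand, the policy that slab_max_order caps is
the order-selection loop in calculate_slab_order(), sketched below in
simplified form (pick_slab_order() is a made-up name for illustration; the
real code uses cache_estimate() and also accounts for slab management
overhead):

	static unsigned int pick_slab_order(size_t size, unsigned int slab_max_order)
	{
		unsigned int order, best = 0;

		for (order = 0; order < MAX_ORDER; order++) {
			size_t slab_bytes = PAGE_SIZE << order;
			size_t num = slab_bytes / size;	/* objects that would fit */

			if (!num)
				continue;	/* 0 objects fit: forced to grow */

			best = order;		/* a usable order was found */

			if (order >= slab_max_order)
				break;		/* honour the cap */

			/* stop once internal fragmentation drops below 1/8 */
			if ((slab_bytes - num * size) * 8 <= slab_bytes)
				break;
		}
		return best;
	}

The rename makes it clearer that the cap is only exceeded when even a single
object does not fit at slab_max_order.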
From 3df1cccdfb3fab6aa9176beb655d802eb384eabc Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Tue, 18 Oct 2011 22:09:28 -0700
Subject: slab: introduce slab_max_order kernel parameter

Introduce a new slab_max_order kernel parameter, which is the equivalent of
slub_max_order.

For immediate purposes, this allows users to override the heuristic that
sets the max order to 1 by default if they have more than 32MB of RAM.
This may result in page allocation failures if there is substantial
fragmentation.

Another use case would be to increase the max order for better performance.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 1a482e8402c4..b0414d12fd08 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -479,11 +479,13 @@ EXPORT_SYMBOL(slab_buffer_size);
 #endif
 
 /*
- * Do not go above this order unless 0 objects fit into the slab.
+ * Do not go above this order unless 0 objects fit into the slab or
+ * overridden on the command line.
  */
 #define SLAB_MAX_ORDER_HI 1
 #define SLAB_MAX_ORDER_LO 0
 static int slab_max_order = SLAB_MAX_ORDER_LO;
+static bool slab_max_order_set __initdata;
 
 /*
  * Functions for storing/retrieving the cachep and or slab from the page
@@ -851,6 +853,17 @@ static int __init noaliencache_setup(char *s)
 }
 __setup("noaliencache", noaliencache_setup);
 
+static int __init slab_max_order_setup(char *str)
+{
+	get_option(&str, &slab_max_order);
+	slab_max_order = slab_max_order < 0 ? 0 :
+				min(slab_max_order, MAX_ORDER - 1);
+	slab_max_order_set = true;
+
+	return 1;
+}
+__setup("slab_max_order=", slab_max_order_setup);
+
 #ifdef CONFIG_NUMA
 /*
  * Special reaping functions for NUMA systems called from cache_reap().
@@ -1499,9 +1512,10 @@ void __init kmem_cache_init(void)
 
 	/*
 	 * Fragmentation resistance on low memory - only use bigger
-	 * page orders on machines with more than 32MB of memory.
+	 * page orders on machines with more than 32MB of memory if
+	 * not overridden on the command line.
 	 */
-	if (totalram_pages > (32 << 20) >> PAGE_SHIFT)
+	if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
 		slab_max_order = SLAB_MAX_ORDER_HI;
 
 	/* Bootstrap is tricky, because several objects are allocated
--
cgit v1.2.3

From 265d47e7115023df9e2b7a864b207b4738d9e18c Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Tue, 15 Nov 2011 15:04:00 -0800
Subject: slub: add taint flag outputting to debug paths

When we get corruption reports, it's useful to see if the kernel was
tainted, to rule out problems we can't do anything about.

Signed-off-by: Dave Jones
Signed-off-by: Andrew Morton
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 7d2a996c307e..60552d52f847 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -570,7 +570,7 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
 	va_end(args);
 	printk(KERN_ERR "========================================"
 			"=====================================\n");
-	printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
+	printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf);
 	printk(KERN_ERR "----------------------------------------"
 			"-------------------------------------\n\n");
 }
--
cgit v1.2.3

From face37f5e615646f364fa848f0a5c9d361d7a46e Mon Sep 17 00:00:00 2001
From: Dave Jones
Date: Tue, 15 Nov 2011 15:03:52 -0800
Subject: slab: add taint flag outputting to debug paths.

When we get corruption reports, it's useful to see if the kernel was
tainted, to rule out problems we can't do anything about.

Signed-off-by: Dave Jones
Signed-off-by: Andrew Morton
Signed-off-by: Pekka Enberg
---
 mm/slab.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index b0414d12fd08..a7f9c244aac6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1941,8 +1941,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 			/* Print header */
 			if (lines == 0) {
 				printk(KERN_ERR
-					"Slab corruption: %s start=%p, len=%d\n",
-					cachep->name, realobj, size);
+					"Slab corruption (%s): %s start=%p, len=%d\n",
+					print_tainted(), cachep->name, realobj, size);
 				print_objinfo(cachep, objp, 0);
 			}
 			/* Hexdump the affected line */
@@ -3051,8 +3051,9 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 	if (entries != cachep->num - slabp->inuse) {
 bad:
 		printk(KERN_ERR "slab: Internal list corruption detected in "
-				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-				cachep->name, cachep->num, slabp, slabp->inuse);
+				"cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+				cachep->name, cachep->num, slabp, slabp->inuse,
+				print_tainted());
 		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
 			sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
 			1);
--
cgit v1.2.3
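Taken together, the two taint patches above make every SLUB and SLAB
corruption report carry the kernel's taint state next to the cache name.
Illustrative output only (cache names, addresses and taint letters will vary;
an untainted kernel prints "Not tainted" in that position):

	BUG kmalloc-128 (Not tainted): Poison overwritten
	Slab corruption (Not tainted): size-64 start=ffff88003f3a4c48, len=64

A report from a kernel already tainted by, say, an out-of-tree module or an
earlier warning can then be triaged accordingly.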
From 4c493a5a5c0bab6c434af2723328edd79c49aa0c Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 11 Nov 2011 14:54:14 +0800
Subject: slub: add missed accounting

With the per-cpu partial list, a slab is added to the partial list first and
then moved to the node list.  The __slab_free() code path for
add/remove_partial is almost deprecated (except for slub debug).  But we
forget to account for add/remove_partial when moving per-cpu partial pages
to the node list, so the statistics for such events are always 0.  Add the
corresponding accounting.

This is against the patch "slub: use correct parameter to add a page to
partial list tail".

Acked-by: Christoph Lameter
Signed-off-by: Shaohua Li
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index c3138233a6e8..108ed03fb422 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1901,11 +1901,14 @@ static void unfreeze_partials(struct kmem_cache *s)
 			}
 
 			if (l != m) {
-				if (l == M_PARTIAL)
+				if (l == M_PARTIAL) {
 					remove_partial(n, page);
-				else
+					stat(s, FREE_REMOVE_PARTIAL);
+				} else {
 					add_partial(n, page,
 						DEACTIVATE_TO_TAIL);
+					stat(s, FREE_ADD_PARTIAL);
+				}
 
 				l = m;
 			}
--
cgit v1.2.3

From 73736e0387ba0e6d2b703407b4d26168d31516a7 Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Tue, 13 Dec 2011 04:57:06 +0100
Subject: slub: fix a possible memleak in __slab_alloc()

Zhihua Che reported a possible memleak in the slub allocator on
CONFIG_PREEMPT=y builds.

It is possible that the current thread migrates right before disabling irqs
in __slab_alloc().  We must check c->freelist again, and perform a normal
allocation instead of scratching c->freelist.

Many thanks to Zhihua Che for spotting this bug, introduced in 2.6.39.

V2: It's also possible an IRQ freed one (or several) object(s) and populated
c->freelist, so it's not a CONFIG_PREEMPT-only problem.

Cc: [2.6.39+]
Reported-by: Zhihua Che
Signed-off-by: Eric Dumazet
Acked-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 108ed03fb422..5e410a95abaf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2169,6 +2169,11 @@ redo:
 		goto new_slab;
 	}
 
+	/* must check again c->freelist in case of cpu migration or IRQ */
+	object = c->freelist;
+	if (object)
+		goto load_freelist;
+
 	stat(s, ALLOC_SLOWPATH);
 
 	do {
--
cgit v1.2.3

From 8f1e33daeda6cd89753f9e77d174805a6f21db09 Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Wed, 23 Nov 2011 09:24:27 -0600
Subject: slub: Switch per cpu partial page support off for debugging

Eric saw an issue with accounting of slabs during validation.

It's not possible to determine accurately how many per-cpu partial slabs
exist at any time, so this switches off per-cpu partial pages during debug.

Acked-by: Eric Dumazet
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index ed3334d9b6da..4056d29e6610 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3028,7 +3028,9 @@ static int kmem_cache_open(struct kmem_cache *s,
 	 * per node list when we run out of per cpu objects. We only fetch 50%
 	 * to keep some capacity around for frees.
 	 */
-	if (s->size >= PAGE_SIZE)
+	if (kmem_cache_debug(s))
+		s->cpu_partial = 0;
+	else if (s->size >= PAGE_SIZE)
 		s->cpu_partial = 2;
 	else if (s->size >= 1024)
 		s->cpu_partial = 6;
--
cgit v1.2.3
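To make the window that Eric's __slab_alloc() fix above closes concrete, here
is an illustrative interleaving (the CONFIG_PREEMPT migration case mentioned
in the changelog opens the same window by landing on a cpu_slab whose
freelist is already populated):

	/*
	 *   __slab_alloc() on CPU0                 IRQ on CPU0
	 *   ----------------------                 -----------
	 *   fast path sees c->freelist == NULL,
	 *   enters the slow path
	 *                                          kfree() fast path pushes the
	 *                                          object onto c->freelist only
	 *   local_irq_save()
	 *   old code: refills c->freelist from
	 *   c->page->freelist, overwriting the
	 *   pointer the IRQ just set, so the
	 *   freed object becomes unreachable
	 *
	 * The added "object = c->freelist" recheck turns this case back into an
	 * ordinary freelist allocation before any refill can clobber it.
	 */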
From 213eeb9fd9d66c33109e2ace242df214dc3a653d Mon Sep 17 00:00:00 2001
From: Christoph Lameter
Date: Fri, 11 Nov 2011 14:07:14 -0600
Subject: slub: Extract get_freelist from __slab_alloc

get_freelist retrieves free objects from the page freelist (put there by
remote frees) or deactivates a slab page if no more objects are available.

Acked-by: David Rientjes
Signed-off-by: Christoph Lameter
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 57 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 5e410a95abaf..6dc79f8e6ce9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2126,6 +2126,37 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
 	return object;
 }
 
+/*
+ * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
+ * or deactivate the page.
+ *
+ * The page is still frozen if the return value is not NULL.
+ *
+ * If this function returns NULL then the page has been unfrozen.
+ */
+static inline void *get_freelist(struct kmem_cache *s, struct page *page)
+{
+	struct page new;
+	unsigned long counters;
+	void *freelist;
+
+	do {
+		freelist = page->freelist;
+		counters = page->counters;
+		new.counters = counters;
+		VM_BUG_ON(!new.frozen);
+
+		new.inuse = page->objects;
+		new.frozen = freelist != NULL;
+
+	} while (!cmpxchg_double_slab(s, page,
+		freelist, counters,
+		NULL, new.counters,
+		"get_freelist"));
+
+	return freelist;
+}
+
 /*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
@@ -2147,8 +2178,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
 	void **object;
 	unsigned long flags;
-	struct page new;
-	unsigned long counters;
 
 	local_irq_save(flags);
 #ifdef CONFIG_PREEMPT
@@ -2176,29 +2205,7 @@ redo:
 
 	stat(s, ALLOC_SLOWPATH);
 
-	do {
-		object = c->page->freelist;
-		counters = c->page->counters;
-		new.counters = counters;
-		VM_BUG_ON(!new.frozen);
-
-		/*
-		 * If there is no object left then we use this loop to
-		 * deactivate the slab which is simple since no objects
-		 * are left in the slab and therefore we do not need to
-		 * put the page back onto the partial list.
-		 *
-		 * If there are objects left then we retrieve them
-		 * and use them to refill the per cpu queue.
-		 */
-
-		new.inuse = c->page->objects;
-		new.frozen = object != NULL;
-
-	} while (!__cmpxchg_double_slab(s, c->page,
-			object, counters,
-			NULL, new.counters,
-			"__slab_alloc"));
+	object = get_freelist(s, c->page);
 
 	if (!object) {
 		c->page = NULL;
--
cgit v1.2.3
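For context, the caller side in __slab_alloc() now reads roughly as follows
(simplified sketch; labels, statistics and the surrounding retry logic are
omitted):

	object = get_freelist(s, c->page);
	if (!object) {
		/* the page freelist was empty: get_freelist() unfroze the page */
		c->page = NULL;
		goto new_slab;
	}
	/*
	 * Otherwise the page is still frozen and 'object' heads the chain of
	 * remotely freed objects that refills the per-cpu lockless freelist.
	 */

Setting new.frozen = (freelist != NULL) inside get_freelist() is what encodes
this: a page that yields no objects is unfrozen in the same
cmpxchg_double_slab() transaction, so no separate deactivation step is
needed.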
From b13683d1cc14d1dd30b8e20f3ebea3f814ad029f Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 11 Nov 2011 14:54:14 +0800
Subject: slub: add missed accounting

With the per-cpu partial list, a slab is added to the partial list first and
then moved to the node list.  The __slab_free() code path for
add/remove_partial is almost deprecated (except for slub debug).  But we
forget to account for add/remove_partial when moving per-cpu partial pages
to the node list, so the statistics for such events are always 0.  Add the
corresponding accounting.

This is against the patch "slub: use correct parameter to add a page to
partial list tail".

Acked-by: Christoph Lameter
Signed-off-by: Shaohua Li
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 4056d29e6610..8284a206f48d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1901,11 +1901,14 @@ static void unfreeze_partials(struct kmem_cache *s)
 			}
 
 			if (l != m) {
-				if (l == M_PARTIAL)
+				if (l == M_PARTIAL) {
 					remove_partial(n, page);
-				else
+					stat(s, FREE_REMOVE_PARTIAL);
+				} else {
 					add_partial(n, page,
 						DEACTIVATE_TO_TAIL);
+					stat(s, FREE_ADD_PARTIAL);
+				}
 
 				l = m;
 			}
--
cgit v1.2.3

From 74ee4ef1f901fbb014bdcdc9171d126490ce2b62 Mon Sep 17 00:00:00 2001
From: David Rientjes
Date: Mon, 9 Jan 2012 13:19:45 -0800
Subject: slub: disallow changing cpu_partial from userspace for debug caches

For caches with debugging enabled, "slub: Switch per cpu partial page
support off for debugging" changes cpu_partial to 0.  It shouldn't be
tunable from userspace for such caches, otherwise the same accounting
issues arise during validation.

This patch disallows tuning /sys/kernel/slab/cache/cpu_partial to be
non-zero for caches with debugging enabled.

Acked-by: Christoph Lameter
Signed-off-by: David Rientjes
Signed-off-by: Pekka Enberg
---
 mm/slub.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 6dc79f8e6ce9..a47df0aa5d36 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4649,6 +4649,8 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
 	err = strict_strtoul(buf, 10, &objects);
 	if (err)
 		return err;
+	if (objects && kmem_cache_debug(s))
+		return -EINVAL;
 
 	s->cpu_partial = objects;
 	flush_all(s);
--
cgit v1.2.3
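Both this patch and the earlier "slub: Switch per cpu partial page support
off for debugging" key off kmem_cache_debug().  For reference, that helper
looks roughly like the following (sketch only; the exact composition of
SLAB_DEBUG_FLAGS varies between kernel versions):

	static inline int kmem_cache_debug(struct kmem_cache *s)
	{
	#ifdef CONFIG_SLUB_DEBUG
		/* any debug option: sanity checks, red zoning, poisoning, user tracking, ... */
		return unlikely(s->flags & SLAB_DEBUG_FLAGS);
	#else
		return 0;
	#endif
	}

With the check above, writing a non-zero value to
/sys/kernel/slab/cache/cpu_partial for such a cache now fails with EINVAL
instead of silently re-enabling per-cpu partial pages.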