author | Christoph Lameter <clameter@sgi.com> | 2006-12-06 20:33:29 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.osdl.org> | 2006-12-07 08:39:25 -0800 |
commit | 3c517a6132098ca37e122a2980fc64a9e798b0d7 (patch) | |
tree | e8fa49ef3f873624c0f5d29b34fdc8684988b426 /mm | |
parent | 952f3b51beb592f3f1de15adcdef802fc086ea91 (diff) | |
[PATCH] slab: better fallback allocation behavior
Currently we simply attempt to allocate from all allowed nodes using
GFP_THISNODE. However, GFP_THISNODE does not do reclaim (it won't do any at
all if the recent GFP_THISNODE patch is accepted). If we truly run out of
memory in the whole system then fallback_alloc may return NULL although
memory may still be available if we performed more thorough reclaim.
This patch changes fallback_alloc() so that we first inspect only the
per node queues for available slabs. If we find any then we allocate from
those. This avoids slab fragmentation by first getting rid of all partially
allocated slabs on every node before allocating new memory.
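For reference, here is a condensed sketch of that first pass, lifted from the fallback_alloc() hunk in the diff below (real mm/slab.c identifiers; the retry handling is omitted):

```c
/*
 * Pass 1 (condensed from the patch below): walk the zonelist and take
 * objects only from nodes that already have free objects queued.
 * GFP_THISNODE keeps the attempt confined to that node, so this pass
 * never allocates new pages.
 */
for (z = zonelist->zones; *z && !obj; z++) {
    int nid = zone_to_nid(*z);

    if (cpuset_zone_allowed(*z, flags) &&
        cache->nodelists[nid] &&
        cache->nodelists[nid]->free_objects)
            obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid);
}
```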
If we cannot satisfy the allocation from any per node queue then we extend
a slab. We now call into the page allocator without specifying
GFP_THISNODE. The page allocator will then implement its own fallback (in
the given cpuset context), perform necessary reclaim (again considering not
a single node but the whole set of allowed nodes) and then return pages for
a new slab.
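A condensed sketch of that second pass, again lifted from the fallback_alloc() hunk in the diff below (the retry taken when another CPU empties the fresh slab is left out here):

```c
/*
 * Pass 2 (condensed from the patch below): neither a node nor
 * GFP_THISNODE is specified, so the page allocator applies its own
 * fallback and reclaim within the current cpuset / memory policy.
 * The node that actually supplied the pages is looked up afterwards
 * and the new slab is inserted into that node's lists.
 */
if (!obj) {
    obj = kmem_getpages(cache, flags, -1);  /* -1: no node preference */
    if (obj) {
        int nid = page_to_nid(virt_to_page(obj));

        if (cache_grow(cache, flags, nid, obj)) {
            obj = ____cache_alloc_node(cache,
                    flags | GFP_THISNODE, nid);
        } else {
            kmem_freepages(cache, obj);
            obj = NULL;
        }
    }
}
```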
We identify from which node the pages were allocated and then insert the
pages into the corresponding per node structure. In order to do so we need
to modify cache_grow() to take a parameter that specifies the new slab.
kmem_getpages() can no longer set the GFP_THISNODE flag since we need to be
able to use kmem_getpages() to allocate from an arbitrary node. GFP_THISNODE
needs to be specified when calling cache_grow().
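The resulting calling convention, as it appears in the diff below, shown here only as an orientation sketch:

```c
/*
 * New signature: objp may carry pages that were already allocated,
 * or NULL when cache_grow() should call kmem_getpages() itself.
 */
static int cache_grow(struct kmem_cache *cachep,
                      gfp_t flags, int nodeid, void *objp);

/* Ordinary refill paths keep the old node-local behaviour: */
x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

/* fallback_alloc() hands over the pages it already obtained, so they
 * end up on the lists of the node that actually provided them: */
x = cache_grow(cache, flags, nid, obj);
```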
One key advantage is that the decision of which node to allocate new
memory from is removed from slab fallback processing. The patch allows us
to go back to using the page allocator's fallback/reclaim logic.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/slab.c | 79 |
1 files changed, 57 insertions, 22 deletions
diff --git a/mm/slab.c b/mm/slab.c
index 6da554fd3f6a..7b8e5d668586 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1605,12 +1605,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
     flags |= __GFP_COMP;
 #endif

-    /*
-     * Under NUMA we want memory on the indicated node. We will handle
-     * the needed fallback ourselves since we want to serve from our
-     * per node object lists first for other nodes.
-     */
-    flags |= cachep->gfpflags | GFP_THISNODE;
+    flags |= cachep->gfpflags;

     page = alloc_pages_node(nodeid, flags, cachep->gfporder);
     if (!page)
@@ -2567,7 +2562,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
     if (OFF_SLAB(cachep)) {
         /* Slab management obj is off-slab. */
         slabp = kmem_cache_alloc_node(cachep->slabp_cache,
-                          local_flags, nodeid);
+                          local_flags & ~GFP_THISNODE, nodeid);
         if (!slabp)
             return NULL;
     } else {
@@ -2708,10 +2703,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+        gfp_t flags, int nodeid, void *objp)
 {
     struct slab *slabp;
-    void *objp;
     size_t offset;
     gfp_t local_flags;
     unsigned long ctor_flags;
@@ -2763,12 +2758,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
      * Get mem for the objs.  Attempt to allocate a physical page from
      * 'nodeid'.
      */
-    objp = kmem_getpages(cachep, flags, nodeid);
+    if (!objp)
+        objp = kmem_getpages(cachep, flags, nodeid);
     if (!objp)
         goto failed;

     /* Get slab management. */
-    slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
+    slabp = alloc_slabmgmt(cachep, objp, offset,
+            local_flags & ~GFP_THISNODE, nodeid);
     if (!slabp)
         goto opps1;

@@ -3006,7 +3003,7 @@ alloc_done:

     if (unlikely(!ac->avail)) {
         int x;
-        x = cache_grow(cachep, flags, node);
+        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

         /* cache_grow can reenable interrupts, then ac could change. */
         ac = cpu_cache_get(cachep);
@@ -3166,9 +3163,11 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)

 /*
  * Fallback function if there was no memory available and no objects on a
- * certain node and we are allowed to fall back. We mimick the behavior of
- * the page allocator. We fall back according to a zonelist determined by
- * the policy layer while obeying cpuset constraints.
+ * certain node and fall back is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
  */
 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 {
@@ -3176,15 +3175,51 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
                     ->node_zonelists[gfp_zone(flags)];
     struct zone **z;
     void *obj = NULL;
+    int nid;

+retry:
+    /*
+     * Look through allowed nodes for objects available
+     * from existing per node queues.
+     */
     for (z = zonelist->zones; *z && !obj; z++) {
-        int nid = zone_to_nid(*z);
+        nid = zone_to_nid(*z);
+
+        if (cpuset_zone_allowed(*z, flags) &&
+            cache->nodelists[nid] &&
+            cache->nodelists[nid]->free_objects)
+                obj = ____cache_alloc_node(cache,
+                    flags | GFP_THISNODE, nid);
+    }

-        if (zone_idx(*z) <= ZONE_NORMAL &&
-            cpuset_zone_allowed(*z, flags) &&
-            cache->nodelists[nid])
-            obj = ____cache_alloc_node(cache,
-                flags | __GFP_THISNODE, nid);
+    if (!obj) {
+        /*
+         * This allocation will be performed within the constraints
+         * of the current cpuset / memory policy requirements.
+         * We may trigger various forms of reclaim on the allowed
+         * set and go into memory reserves if necessary.
+         */
+        obj = kmem_getpages(cache, flags, -1);
+        if (obj) {
+            /*
+             * Insert into the appropriate per node queues
+             */
+            nid = page_to_nid(virt_to_page(obj));
+            if (cache_grow(cache, flags, nid, obj)) {
+                obj = ____cache_alloc_node(cache,
+                    flags | GFP_THISNODE, nid);
+                if (!obj)
+                    /*
+                     * Another processor may allocate the
+                     * objects in the slab since we are
+                     * not holding any locks.
+                     */
+                    goto retry;
+            } else {
+                kmem_freepages(cache, obj);
+                obj = NULL;
+            }
+        }
     }
     return obj;
 }
@@ -3241,7 +3276,7 @@ retry:

 must_grow:
     spin_unlock(&l3->list_lock);
-    x = cache_grow(cachep, flags, nodeid);
+    x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
     if (x)
         goto retry;