From 9e1b32caa525cb236e80e9c671e179bcecccc657 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 22 Jul 2009 15:44:28 +1000 Subject: mm: Pass virtual address to [__]p{te,ud,md}_free_tlb() mm: Pass virtual address to [__]p{te,ud,md}_free_tlb() Upcoming paches to support the new 64-bit "BookE" powerpc architecture will need to have the virtual address corresponding to PTE page when freeing it, due to the way the HW table walker works. Basically, the TLB can be loaded with "large" pages that cover the whole virtual space (well, sort-of, half of it actually) represented by a PTE page, and which contain an "indirect" bit indicating that this TLB entry RPN points to an array of PTEs from which the TLB can then create direct entries. Thus, in order to invalidate those when PTE pages are deleted, we need the virtual address to pass to tlbilx or tlbivax instructions. The old trick of sticking it somewhere in the PTE page struct page sucks too much, the address is almost readily available in all call sites and almost everybody implemets these as macros, so we may as well add the argument everywhere. I added it to the pmd and pud variants for consistency. Signed-off-by: Benjamin Herrenschmidt Acked-by: David Howells [MN10300 & FRV] Acked-by: Nick Piggin Acked-by: Martin Schwidefsky [s390] Signed-off-by: Linus Torvalds --- mm/memory.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 65216194eb8d..aede2ce3aba4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_free_tlb(tlb, token); + pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } @@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - free_pte_range(tlb, pmd); + free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* -- cgit v1.2.3 From dddac6a7b445de95515f64fdf82fe5dc36c02f26 Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Wed, 29 Jul 2009 21:07:55 +0200 Subject: PM / Hibernate: Replace bdget call with simple atomic_inc of i_count Create bdgrab(). This function copies an existing reference to a block_device. It is safe to call from any context. Hibernation code wishes to copy a reference to the active swap device. Right now it calls bdget() under a spinlock, but this is wrong because bdget() can sleep. It doesn't need a full bdget() because we already hold a reference to active swap devices (and the spinlock protects against swapoff). Fixes http://bugzilla.kernel.org/show_bug.cgi?id=13827 Signed-off-by: Alan Jenkins Signed-off-by: Rafael J. Wysocki --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index d1ade1a48ee7..8ffdc0d23c53 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) if (!bdev) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); return i; @@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); -- cgit v1.2.3 From f5886c7f96f2542382d3a983c5f13e03d7fc5259 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 29 Jul 2009 16:26:57 +0100 Subject: kmemleak: Protect the seq start/next/stop sequence by rcu_read_lock() Objects passed to kmemleak_seq_next() have an incremented reference count (hence not freed) but they may point via object_list.next to other freed objects. To avoid this, the whole start/next/stop sequence must be protected by rcu_read_lock(). Signed-off-by: Catalin Marinas Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5aabd41ffb8f..487267310a84 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1217,7 +1217,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) } object = NULL; out: - rcu_read_unlock(); return object; } @@ -1233,13 +1232,11 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++(*pos); - rcu_read_lock(); list_for_each_continue_rcu(n, &object_list) { next_obj = list_entry(n, struct kmemleak_object, object_list); if (get_object(next_obj)) break; } - rcu_read_unlock(); put_object(prev_obj); return next_obj; @@ -1255,6 +1252,7 @@ static void kmemleak_seq_stop(struct seq_file *seq, void *v) * kmemleak_seq_start may return ERR_PTR if the scan_mutex * waiting was interrupted, so only release it if !IS_ERR. */ + rcu_read_unlock(); mutex_unlock(&scan_mutex); if (v) put_object(v); -- cgit v1.2.3 From e084b2d95e48b31aa45f9c49ffc6cdae8bdb21d4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jul 2009 15:02:04 -0700 Subject: page-allocator: preserve PFN ordering when __GFP_COLD is set Fix a post-2.6.24 performace regression caused by 3dfa5721f12c3d5a441448086bee156887daa961 ("page-allocator: preserve PFN ordering when __GFP_COLD is set"). Narayanan reports "The regression is around 15%. There is no disk controller as our setup is based on Samsung OneNAND used as a memory mapped device on a OMAP2430 based board." The page allocator tries to preserve contiguous PFN ordering when returning pages such that repeated callers to the allocator have a strong chance of getting physically contiguous pages, particularly when external fragmentation is low. However, of the bulk of the allocations have __GFP_COLD set as they are due to aio_read() for example, then the PFNs are in reverse PFN order. This can cause performance degration when used with IO controllers that could have merged the requests. This patch attempts to preserve the contiguous ordering of PFNs for users of __GFP_COLD. Signed-off-by: Mel Gorman Reported-by: Narayananu Gopalakrishnan Tested-by: Narayanan Gopalakrishnan Cc: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index caa92689aac9..ae28c22a7fdb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -882,7 +882,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, int cold) { int i; @@ -901,7 +901,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - list_add(&page->lru, list); + if (likely(cold == 0)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); set_page_private(page, migratetype); list = &page->lru; } @@ -1119,7 +1122,8 @@ again: local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); if (unlikely(!pcp->count)) goto failed; } @@ -1138,7 +1142,8 @@ again: /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); page = list_entry(pcp->list.next, struct page, lru); } -- cgit v1.2.3 From 6583bb64fc370842b32a87c67750c26f6d559af0 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 29 Jul 2009 15:02:06 -0700 Subject: mm: avoid endless looping for oom killed tasks If a task is oom killed and still cannot find memory when trying with no watermarks, it's better to fail the allocation attempt than to loop endlessly. Direct reclaim has already failed and the oom killer will be a no-op since current has yet to die, so there is no other alternative for allocations that are not __GFP_NOFAIL. Acked-by: Mel Gorman Signed-off-by: David Rientjes Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ae28c22a7fdb..2dbb2fc68837 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1794,6 +1794,10 @@ rebalance: if (p->flags & PF_MEMALLOC) goto nopage; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, -- cgit v1.2.3 From e4c6f8bed01f9f9a5c607bd689bf67e7b8a36bd8 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 29 Jul 2009 15:02:16 -0700 Subject: hugetlbfs: fix i_blocks accounting As reported in Red Hat bz #509671, i_blocks for files on hugetlbfs get accounting wrong when doing something like: $ > foo $ date > foo date: write error: Invalid argument $ /usr/bin/stat foo File: `foo' Size: 0 Blocks: 18446744073709547520 IO Block: 2097152 regular ... This is because hugetlb_unreserve_pages() is unconditionally removing blocks_per_huge_page(h) on each call rather than using the freed amount. If there were 0 blocks, it goes negative, resulting in the above. This is a regression from commit a5516438959d90b071ff0a484ce4f3f523dc3152 ("hugetlb: modular state for hugetlb page size") which did: - inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; + inode->i_blocks -= blocks_per_huge_page(h); so just put back the freed multiplier, and it's all happy again. Signed-off-by: Eric Sandeen Acked-by: Andi Kleen Cc: William Lee Irwin III Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d0351e31f474..cafdcee154e8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2370,7 +2370,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= blocks_per_huge_page(h); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); -- cgit v1.2.3 From 887032670d47366a8c8f25396ea7c14b7b2cc620 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 29 Jul 2009 15:04:06 -0700 Subject: cgroup avoid permanent sleep at rmdir After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg) doesn't return -EBUSY by temporary ref counts. That commit expects all refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can wait permanently. This patch tries to fix that and change followings. - set CGRP_WAIT_ON_RMDIR flag before pre_destroy(). - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case. if there are sleeping ones, wakes them up. - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set. Tested-by: Daisuke Nishimura Reported-by: Daisuke Nishimura Reviewed-by: Paul Menage Acked-by: Balbir Sigh Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e717964cb5a0..fd4529d86de5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ret = 0; out: unlock_page_cgroup(pc); + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and it's garanteed that + * "to" is never removed. So, we don't check rmdir status here. + */ return ret; } @@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); __mem_cgroup_commit_charge(ptr, pc, ctype); @@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, } rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ - + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&ptr->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) @@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, if (!mem) return; - + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { target = oldpage; @@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, */ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) mem_cgroup_uncharge_page(target); + /* + * At migration, we may charge account against cgroup which has no tasks + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&mem->css); } /* -- cgit v1.2.3 From 1fc28b70fe2dbf87e061b6ce5091a1f8e4e5d4e7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 29 Jul 2009 15:04:08 -0700 Subject: page-allocator: allow too high-order warning messages to be suppressed with __GFP_NOWARN The page allocator warns once when an order >= MAX_ORDER is specified. This is to catch callers of the allocator that are always falling back to their worst-case when it was not expected. However, there are cases where the caller is behaving correctly but cannot suppress the warning. This patch allows the warning to be suppressed by the callers by specifying __GFP_NOWARN. Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Arnaldo Carvalho de Melo Cc: "David S. Miller" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2dbb2fc68837..d052abbe3063 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1745,8 +1745,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * be using allocators in order of preference for an area that is * too large. */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); return NULL; + } /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and -- cgit v1.2.3 From 4bfc44958e499af9a73f62201543b3a1f617cfeb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 6 Aug 2009 15:07:33 -0700 Subject: mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro Signed-off-by: KAMEZAWA Hiroyuki Cc: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 84 ++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4da63a..7dd9d9f80694 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_semaphore for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) { - nodemask_t cpuset_context_nmask; int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; + /* Check N_HIGH_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); + nodes_and(nsc->mask2, *nodes, nsc->mask1); + if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else @@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) cpuset_current_mems_allowed; } - ret = mpol_ops[pol->mode].create(pol, - nodes ? &cpuset_context_nmask : NULL); + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } @@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, { struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + NODEMASK_SCRATCH(scratch); int ret; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); + if (!scratch) + return -ENOMEM; + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } /* * prevent changing our mempolicy while show_numa_maps() * is using it. @@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, if (mm) down_write(&mm->mmap_sem); task_lock(current); - ret = mpol_set_nodemask(new, nodes); + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); if (mm) up_write(&mm->mmap_sem); mpol_put(new); - return ret; + goto out; } old = current->mempolicy; current->mempolicy = new; @@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, up_write(&mm->mmap_sem); mpol_put(old); - return 0; + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; } /* @@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) return err; } - down_write(&mm->mmap_sem); - task_lock(current); - err = mpol_set_nodemask(new, nmask); - task_unlock(current); + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + } if (err) { - up_write(&mm->mmap_sem); mpol_put(new); return err; } @@ -1891,6 +1911,7 @@ restart: * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + if (!scratch) + return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) { mpol_put(mpol); /* drop our ref on sb mpol */ + NODEMASK_SCRATCH_FREE(scratch); return; /* no valid nodemask intersection */ } task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); mpol_put(mpol); /* drop our ref on sb mpol */ if (ret) { + NODEMASK_SCRATCH_FREE(scratch); mpol_put(new); return; } @@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ mpol_put(new); /* drop initial ref */ + NODEMASK_SCRATCH_FREE(scratch); } } @@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) err = 1; else { int ret; - - task_lock(current); - ret = mpol_set_nodemask(new, &nodes); - task_unlock(current); - if (ret) + NODEMASK_SCRATCH(scratch); + if (scratch) { + task_lock(current); + ret = mpol_set_nodemask(new, &nodes, scratch); + task_unlock(current); + } else + ret = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + if (ret) { err = 1; - else if (no_context) { + mpol_put(new); + } else if (no_context) { /* save for contextualization */ new->w.user_nodemask = nodes; } -- cgit v1.2.3 From 5e2f89b5d5d87a7c3ba19fc85ba0c29adb65f639 Mon Sep 17 00:00:00 2001 From: "Figo.zhang" Date: Sat, 8 Aug 2009 21:01:22 +0800 Subject: mempool.c: clean up type-casting clean up type-casting twice. "size_t" is typedef as "unsigned long" in 64-bit system, and "unsigned int" in 32-bit system, and the intermediate cast to 'long' is pointless. Signed-off-by: Figo.zhang Signed-off-by: Linus Torvalds --- mm/mempool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b4bb66..32e75d400503 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -303,14 +303,14 @@ EXPORT_SYMBOL(mempool_free_slab); */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t)(long)pool_data; + size_t size = (size_t)pool_data; return kmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t) pool_data; + size_t size = (size_t)pool_data; return kzalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kzalloc); -- cgit v1.2.3 From 74d46d6b2d23d44d72c37df4c6a5d2e782f7b088 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Jul 2009 17:11:50 +0900 Subject: percpu, sparc64: fix sparse possible cpu map handling percpu code has been assuming num_possible_cpus() == nr_cpu_ids which is incorrect if cpu_possible_map contains holes. This causes percpu code to access beyond allocated memories and vmalloc areas. On a sparc64 machine with cpus 0 and 2 (u60), this triggers the following warning or fails boot. WARNING: at /devel/tj/os/work/mm/vmalloc.c:106 vmap_page_range_noflush+0x1f0/0x240() Modules linked in: Call Trace: [00000000004b17d0] vmap_page_range_noflush+0x1f0/0x240 [00000000004b1840] map_vm_area+0x20/0x60 [00000000004b1950] __vmalloc_area_node+0xd0/0x160 [0000000000593434] deflate_init+0x14/0xe0 [0000000000583b94] __crypto_alloc_tfm+0xd4/0x1e0 [00000000005844f0] crypto_alloc_base+0x50/0xa0 [000000000058b898] alg_test_comp+0x18/0x80 [000000000058dad4] alg_test+0x54/0x180 [000000000058af00] cryptomgr_test+0x40/0x60 [0000000000473098] kthread+0x58/0x80 [000000000042b590] kernel_thread+0x30/0x60 [0000000000472fd0] kthreadd+0xf0/0x160 ---[ end trace 429b268a213317ba ]--- This patch fixes generic percpu functions and sparc64 setup_per_cpu_areas() so that they handle sparse cpu_possible_map properly. Please note that on x86, cpu_possible_map() doesn't contain holes and thus num_possible_cpus() == nr_cpu_ids and this patch doesn't cause any behavior difference. Signed-off-by: Tejun Heo Acked-by: David S. Miller Cc: Ingo Molnar --- mm/percpu.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index b70f2acd8853..e0be1146f617 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -8,12 +8,12 @@ * * This is percpu allocator which can handle both static and dynamic * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * chunk is consisted of nr_cpu_ids units and the first chunk is used + * for static percpu variables in the kernel image (special boot time + * alloc/init handling necessary as these areas need to be brought up + * before allocation services are running). Unit grows as necessary + * and all units grow or shrink in unison. When a chunk is filled up, + * another chunk is allocated. ie. in vmalloc area * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -558,7 +558,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, bool flush_tlb) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; /* unmap must not be done on immutable chunk */ @@ -643,7 +643,7 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, */ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) { - unsigned int last = num_possible_cpus() - 1; + unsigned int last = nr_cpu_ids - 1; unsigned int cpu; int err; @@ -1067,9 +1067,9 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, PFN_UP(size_sum)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_chunk_size = nr_cpu_ids * pcpu_unit_size; pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + + nr_cpu_ids * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) dyn_size = pcpu_unit_size - static_size - reserved_size; @@ -1248,7 +1248,7 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } else pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); - chunk_size = pcpue_unit_size * num_possible_cpus(); + chunk_size = pcpue_unit_size * nr_cpu_ids; pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); @@ -1259,12 +1259,15 @@ ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, } /* return the leftover and copy */ - for_each_possible_cpu(cpu) { + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { void *ptr = pcpue_ptr + cpu * pcpue_unit_size; - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); + if (cpu_possible(cpu)) { + free_bootmem(__pa(ptr + pcpue_size), + pcpue_unit_size - pcpue_size); + memcpy(ptr, __per_cpu_load, static_size); + } else + free_bootmem(__pa(ptr), pcpue_unit_size); } /* we're ready, commit */ -- cgit v1.2.3 From 142d44b0dd6741a64a7bdbe029110e7c1dcf1d23 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Thu, 13 Aug 2009 02:00:13 -0400 Subject: percpu: use the right flag for get_vm_area() get_vm_area() only accepts VM_* flags, not GFP_*. And according to the doc of get_vm_area(), here should be VM_ALLOC. Signed-off-by: WANG Cong Acked-by: Tejun Heo Cc: Ingo Molnar --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index e0be1146f617..5fe37842e0ea 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -749,7 +749,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) chunk->map[chunk->map_used++] = pcpu_unit_size; chunk->page = chunk->page_ar; - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC); if (!chunk->vm) { free_pcpu_chunk(chunk); return NULL; -- cgit v1.2.3 From 788084aba2ab7348257597496befcbccabdc98a3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 31 Jul 2009 12:54:11 -0400 Subject: Security/SELinux: seperate lsm specific mmap_min_addr Currently SELinux enforcement of controls on the ability to map low memory is determined by the mmap_min_addr tunable. This patch causes SELinux to ignore the tunable and instead use a seperate Kconfig option specific to how much space the LSM should protect. The tunable will now only control the need for CAP_SYS_RAWIO and SELinux permissions will always protect the amount of low memory designated by CONFIG_LSM_MMAP_MIN_ADDR. This allows users who need to disable the mmap_min_addr controls (usual reason being they run WINE as a non-root user) to do so and still have SELinux controls preventing confined domains (like a web server) from being able to map some area of low memory. Signed-off-by: Eric Paris Signed-off-by: James Morris --- mm/Kconfig | 6 +++--- mm/mmap.c | 3 --- mm/nommu.c | 3 --- 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bde..fe5f674d7a7d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. diff --git a/mm/mmap.c b/mm/mmap.c index 34579b23ebd5..8101de490c73 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/nommu.c b/mm/nommu.c index 53cab10fece4..28754c40be98 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); -- cgit v1.2.3 From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 18 Aug 2009 14:11:10 -0700 Subject: mm: revert "oom: move oom_adj value" The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 64 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 39 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a99..a7b2460e922b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; - int oom_adj; task_lock(p); mm = p->mm; @@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } /* * The memory size of the process is the basis for the badness. @@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oom_adj. + * Adjust the score by oomkilladj. */ - if (oom_adj) { - if (oom_adj > 0) { + if (p->oomkilladj) { + if (p->oomkilladj > 0) { if (!points) points = 1; - points <<= oom_adj; + points <<= p->oomkilladj; } else - points >>= -(oom_adj); + points >>= -(p->oomkilladj); } #ifdef DEBUG @@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } + if (p->oomkilladj == OOM_DISABLE) + continue; + points = badness(p, uptime.tv_sec); - if (points > *ppoints) { + if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } @@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); return; + } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; - task_lock(p); mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL) return 1; - } - task_unlock(p); + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + __oom_kill_task(p, 1); /* @@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - task_lock(current); printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { + if (p->flags & PF_EXITING) { __oom_kill_task(p, 0); return 0; } -- cgit v1.2.3 From 28d7a6ae92c099d81cbea08c20be0d2cf7ccd7ca Mon Sep 17 00:00:00 2001 From: Graff Yang Date: Tue, 18 Aug 2009 14:11:17 -0700 Subject: nommu: check fd read permission in validate_mmap_request() According to the POSIX (1003.1-2008), the file descriptor shall have been opened with read permission, regardless of the protection options specified to mmap(). The ltp test cases mmap06/07 need this. Signed-off-by: Graff Yang Acked-by: Paul Mundt Signed-off-by: David Howells Acked-by: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nommu.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 28754c40be98..4bde489ec431 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -919,6 +919,10 @@ static int validate_mmap_request(struct file *file, if (!file->f_op->read) capabilities &= ~BDI_CAP_MAP_COPY; + /* The file shall have been opened with read permission. */ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (flags & MAP_SHARED) { /* do checks for writing, appending and locking */ if ((prot & PROT_WRITE) && -- cgit v1.2.3 From 7f9cfb31030737a7fc9a1cbca3fd01bec184c849 Mon Sep 17 00:00:00 2001 From: Bo Liu Date: Tue, 18 Aug 2009 14:11:19 -0700 Subject: mm: build_zonelists(): move clear node_load[] to __build_all_zonelists() If node_load[] is cleared everytime build_zonelists() is called,node_load[] will have no help to find the next node that should appear in the given node's fallback list. Because of the bug, zonelist's node_order is not calculated as expected. This bug affects on big machine, which has asynmetric node distance. [synmetric NUMA's node distance] 0 1 2 0 10 12 12 1 12 10 12 2 12 12 10 [asynmetric NUMA's node distance] 0 1 2 0 10 12 20 1 12 10 14 2 20 14 10 This (my bug) is very old but no one has reported this for a long time. Maybe because the number of asynmetric NUMA is very small and they use cpuset for customizing node memory allocation fallback. [akpm@linux-foundation.org: fix CONFIG_NUMA=n build] Signed-off-by: Bo Liu Reviewed-by: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Christoph Lameter Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d052abbe3063..5cc986eb9f6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2544,7 +2544,6 @@ static void build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); - memset(node_load, 0, sizeof(node_load)); memset(node_order, 0, sizeof(node_order)); j = 0; @@ -2653,6 +2652,9 @@ static int __build_all_zonelists(void *dummy) { int nid; +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); -- cgit v1.2.3 From 03ef83af528899aa339e42d8024b37e2f434fba4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 26 Aug 2009 14:29:23 -0700 Subject: mm: fix for infinite churning of mlocked pages An mlocked page might lose the isolatation race. This causes the page to clear PG_mlocked while it remains in a VM_LOCKED vma. This means it can be put onto the [in]active list. We can rescue it by using try_to_unmap() in shrink_page_list(). But now, As Wu Fengguang pointed out, vmscan has a bug. If the page has PG_referenced, it can't reach try_to_unmap() in shrink_page_list() but is put into the active list. If the page is referenced repeatedly, it can remain on the [in]active list without being moving to the unevictable list. This patch fixes it. Reported-by: Wu Fengguang Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro < Cc: Lee Schermerhorn Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 1 + mm/vmscan.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 836c6c63e1f2..0895b5c7cbff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -358,6 +358,7 @@ static int page_referenced_one(struct page *page, */ if (vma->vm_flags & VM_LOCKED) { *mapcount = 1; /* break early from loop */ + *vm_flags |= VM_LOCKED; goto out_unmap; } diff --git a/mm/vmscan.c b/mm/vmscan.c index dea7abd31098..94e86dd6954c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -630,9 +630,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, referenced = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); - /* In active use or really unfreeable? Activate it. */ + /* + * In active use or really unfreeable? Activate it. + * If page which have PG_mlocked lost isoltation race, + * try_to_unmap moves it to unevictable list + */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + referenced && page_mapping_inuse(page) + && !(vm_flags & VM_LOCKED)) goto activate_locked; /* -- cgit v1.2.3