From 47d38344abd0c7c6793b59ac741aa5b205fc197c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:41:54 -0700 Subject: hugetlb: rename max_hstate to hugetlb_max_hstate This patchset implements a cgroup resource controller for HugeTLB pages. The controller allows to limit the HugeTLB usage per control group and enforces the controller limit during page fault. Since HugeTLB doesn't support page reclaim, enforcing the limit at page fault time implies that, the application will get SIGBUS signal if it tries to access HugeTLB pages beyond its limit. This requires the application to know beforehand how much HugeTLB pages it would require for its use. The goal is to control how many HugeTLB pages a group of task can allocate. It can be looked at as an extension of the existing quota interface which limits the number of HugeTLB pages per hugetlbfs superblock. HPC job scheduler requires jobs to specify their resource requirements in the job file. Once their requirements can be met, job schedulers like (SLURM) will schedule the job. We need to make sure that the jobs won't consume more resources than requested. If they do we should either error out or kill the application. This patch: Rename max_hstate to hugetlb_max_hstate. We will be using this from other subsystems like hugetlb controller in later patches. Signed-off-by: Aneesh Kumar K.V Acked-by: David Rientjes Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e198831276a3..c86830931cc6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,7 +34,7 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int max_hstate; +static int hugetlb_max_hstate; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -46,7 +46,7 @@ static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; #define for_each_hstate(h) \ - for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) + for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -1897,9 +1897,9 @@ void __init hugetlb_add_hstate(unsigned order) printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); return; } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); - h = &hstates[max_hstate++]; + h = &hstates[hugetlb_max_hstate++]; h->order = order; h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); h->nr_huge_pages = 0; @@ -1920,10 +1920,10 @@ static int __init hugetlb_nrpages_setup(char *s) static unsigned long *last_mhp; /* - * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". */ - if (!max_hstate) + if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; @@ -1942,7 +1942,7 @@ static int __init hugetlb_nrpages_setup(char *s) * But we need to allocate >= MAX_ORDER hstates here early to still * use the bootmem allocator. */ - if (max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; -- cgit v1.2.3 From 76dcee75c1aff61259f5ed55e2bcfab60cc4bd5f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:41:57 -0700 Subject: hugetlb: don't use ERR_PTR with VM_FAULT* values The current use of VM_FAULT_* codes with ERR_PTR requires us to ensure VM_FAULT_* values will not exceed MAX_ERRNO value. Decouple the VM_FAULT_* values from MAX_ERRNO. Signed-off-by: Aneesh Kumar K.V Acked-by: Hillf Danton Acked-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c86830931cc6..34a7e2375478 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1123,10 +1123,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, */ chg = vma_needs_reservation(h, vma, addr); if (chg < 0) - return ERR_PTR(-VM_FAULT_OOM); + return ERR_PTR(-ENOMEM); if (chg) if (hugepage_subpool_get_pages(spool, chg)) - return ERR_PTR(-VM_FAULT_SIGBUS); + return ERR_PTR(-ENOSPC); spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); @@ -1136,7 +1136,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugepage_subpool_put_pages(spool, chg); - return ERR_PTR(-VM_FAULT_SIGBUS); + return ERR_PTR(-ENOSPC); } } @@ -2496,6 +2496,7 @@ retry_avoidcopy: new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { + long err = PTR_ERR(new_page); page_cache_release(old_page); /* @@ -2524,7 +2525,10 @@ retry_avoidcopy: /* Caller expects lock to be held */ spin_lock(&mm->page_table_lock); - return -PTR_ERR(new_page); + if (err == -ENOMEM) + return VM_FAULT_OOM; + else + return VM_FAULT_SIGBUS; } /* @@ -2642,7 +2646,11 @@ retry: goto out; page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { - ret = -PTR_ERR(page); + ret = PTR_ERR(page); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); -- cgit v1.2.3 From 972dc4de13f667a7df27ee32573b2e6fc6cc8434 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:00 -0700 Subject: hugetlb: add an inline helper for finding hstate index Add an inline helper and use it in the code. Signed-off-by: Aneesh Kumar K.V Acked-by: David Rientjes Acked-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 34a7e2375478..b1e0ed1ea912 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1646,7 +1646,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct attribute_group *hstate_attr_group) { int retval; - int hi = h - hstates; + int hi = hstate_index(h); hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); if (!hstate_kobjs[hi]) @@ -1741,11 +1741,13 @@ void hugetlb_unregister_node(struct node *node) if (!nhs->hugepages_kobj) return; /* no hstate attributes */ - for_each_hstate(h) - if (nhs->hstate_kobjs[h - hstates]) { - kobject_put(nhs->hstate_kobjs[h - hstates]); - nhs->hstate_kobjs[h - hstates] = NULL; + for_each_hstate(h) { + int idx = hstate_index(h); + if (nhs->hstate_kobjs[idx]) { + kobject_put(nhs->hstate_kobjs[idx]); + nhs->hstate_kobjs[idx] = NULL; } + } kobject_put(nhs->hugepages_kobj); nhs->hugepages_kobj = NULL; @@ -1848,7 +1850,7 @@ static void __exit hugetlb_exit(void) hugetlb_unregister_all_nodes(); for_each_hstate(h) { - kobject_put(hstate_kobjs[h - hstates]); + kobject_put(hstate_kobjs[hstate_index(h)]); } kobject_put(hugepages_kobj); @@ -1869,7 +1871,7 @@ static int __init hugetlb_init(void) if (!size_to_hstate(default_hstate_size)) hugetlb_add_hstate(HUGETLB_PAGE_ORDER); } - default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; + default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); if (default_hstate_max_huge_pages) default_hstate.max_huge_pages = default_hstate_max_huge_pages; @@ -2687,7 +2689,7 @@ retry: */ if (unlikely(PageHWPoison(page))) { ret = VM_FAULT_HWPOISON | - VM_FAULT_SET_HINDEX(h - hstates); + VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } } @@ -2760,7 +2762,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(h - hstates); + VM_FAULT_SET_HINDEX(hstate_index(h)); } ptep = huge_pte_alloc(mm, address, huge_page_size(h)); -- cgit v1.2.3 From 24669e58477e2752c1fbca9c1c988e9dd0d79d15 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:03 -0700 Subject: hugetlb: use mmu_gather instead of a temporary linked list for accumulating pages Use a mmu_gather instead of a temporary linked list for accumulating pages when we unmap a hugepage range Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 59 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 24 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b1e0ed1ea912..e54b695336f9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,8 +24,9 @@ #include #include -#include +#include +#include #include #include #include "internal.h" @@ -2310,30 +2311,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) return 0; } -void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page) { + int force_flush = 0; struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *ptep; pte_t pte; struct page *page; - struct page *tmp; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - /* - * A page gathering list, protected by per file i_mmap_mutex. The - * lock is used to avoid list corruption from multiple unmapping - * of the same page since we are using page->lru. - */ - LIST_HEAD(page_list); - WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, start, end); +again: spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); @@ -2372,30 +2369,45 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } pte = huge_ptep_get_and_clear(mm, address, ptep); + tlb_remove_tlb_entry(tlb, ptep, address); if (pte_dirty(pte)) set_page_dirty(page); - list_add(&page->lru, &page_list); + page_remove_rmap(page); + force_flush = !__tlb_remove_page(tlb, page); + if (force_flush) + break; /* Bail out after unmapping reference page if supplied */ if (ref_page) break; } - flush_tlb_range(vma, start, end); spin_unlock(&mm->page_table_lock); - mmu_notifier_invalidate_range_end(mm, start, end); - list_for_each_entry_safe(page, tmp, &page_list, lru) { - page_remove_rmap(page); - list_del(&page->lru); - put_page(page); + /* + * mmu_gather ran out of room to batch pages, we break out of + * the PTE lock to avoid doing the potential expensive TLB invalidate + * and page-free while holding it. + */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu(tlb); + if (address < end && !ref_page) + goto again; } + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_end_vma(tlb, vma); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - __unmap_hugepage_range(vma, start, end, ref_page); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + struct mm_struct *mm; + struct mmu_gather tlb; + + mm = vma->vm_mm; + + tlb_gather_mmu(&tlb, mm, 0); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page); + tlb_finish_mmu(&tlb, start, end); } /* @@ -2440,9 +2452,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) - __unmap_hugepage_range(iter_vma, - address, address + huge_page_size(h), - page); + unmap_hugepage_range(iter_vma, address, + address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); -- cgit v1.2.3 From 0edaecfab218d747d30de4575e911907371e2cd2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:07 -0700 Subject: hugetlb: add a list for tracking in-use HugeTLB pages hugepage_activelist will be used to track currently used HugeTLB pages. We need to find the in-use HugeTLB pages to support HugeTLB cgroup removal. On cgroup removal we update the page's HugeTLB cgroup to point to parent cgroup. Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e54b695336f9..b5b6e156ca76 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -510,7 +510,7 @@ void copy_huge_page(struct page *dst, struct page *src) static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &h->hugepage_freelists[nid]); + list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; } @@ -522,7 +522,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) if (list_empty(&h->hugepage_freelists[nid])) return NULL; page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); - list_del(&page->lru); + list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; @@ -626,10 +626,11 @@ static void free_huge_page(struct page *page) page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); - INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + /* remove the page from active list */ + list_del(&page->lru); update_and_free_page(h, page); h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; @@ -642,6 +643,7 @@ static void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { + INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); h->nr_huge_pages++; @@ -890,6 +892,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_lock(&hugetlb_lock); if (page) { + INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* @@ -994,7 +997,6 @@ retry: list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; - list_del(&page->lru); /* * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. @@ -1009,7 +1011,6 @@ free: /* Free unnecessary surplus pages to the buddy allocator */ if (!list_empty(&surplus_list)) { list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - list_del(&page->lru); put_page(page); } } @@ -1909,6 +1910,7 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); + INIT_LIST_HEAD(&h->hugepage_activelist); h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", -- cgit v1.2.3 From c3f38a38715e9b4e7b1dda6840a1375f6894744d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:10 -0700 Subject: hugetlb: make some static variables global We will use them later in hugetlb_cgroup.c Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b5b6e156ca76..d5971597736b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,7 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int hugetlb_max_hstate; +int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -46,13 +46,10 @@ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; -#define for_each_hstate(h) \ - for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) - /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ -static DEFINE_SPINLOCK(hugetlb_lock); +DEFINE_SPINLOCK(hugetlb_lock); static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { -- cgit v1.2.3 From 9dd540e23111d8884773ab942a736f3aba4040d4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:15 -0700 Subject: hugetlb/cgroup: add the cgroup pointer to page lru Add the hugetlb cgroup pointer to 3rd page lru.next. This limit the usage to hugetlb cgroup to only hugepages with 3 or more normal pages. I guess that is an acceptable limitation. Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d5971597736b..efe29b53daff 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -28,6 +28,7 @@ #include #include +#include #include #include "internal.h" @@ -591,6 +592,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1 << PG_writeback); } + VM_BUG_ON(hugetlb_cgroup_from_page(page)); set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); @@ -643,6 +645,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); + set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); @@ -892,6 +895,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); + set_hugetlb_cgroup(page, NULL); /* * We incremented the global counters already */ -- cgit v1.2.3 From 6d76dcf40405144a448040a350fd214ddc243d5e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:18 -0700 Subject: hugetlb/cgroup: add charge/uncharge routines for hugetlb cgroup Add the charge and uncharge routines for hugetlb cgroup. We do cgroup charging in page alloc and uncharge in compound page destructor. Assigning page's hugetlb cgroup is protected by hugetlb_lock. [liwp@linux.vnet.ibm.com: add huge_page_order check to avoid incorrect uncharge] Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Aneesh Kumar K.V Signed-off-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index efe29b53daff..16a0f32c4820 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -627,6 +627,8 @@ static void free_huge_page(struct page *page) BUG_ON(page_mapcount(page)); spin_lock(&hugetlb_lock); + hugetlb_cgroup_uncharge_page(hstate_index(h), + pages_per_huge_page(h), page); if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { /* remove the page from active list */ list_del(&page->lru); @@ -1115,7 +1117,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct page *page; long chg; + int ret, idx; + struct hugetlb_cgroup *h_cg; + idx = hstate_index(h); /* * Processes that did not create the mapping will have no * reserves and will not have accounted against subpool @@ -1131,6 +1136,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (hugepage_subpool_get_pages(spool, chg)) return ERR_PTR(-ENOSPC); + ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); + if (ret) { + hugepage_subpool_put_pages(spool, chg); + return ERR_PTR(-ENOSPC); + } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); @@ -1138,6 +1148,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) { page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { + hugetlb_cgroup_uncharge_cgroup(idx, + pages_per_huge_page(h), + h_cg); hugepage_subpool_put_pages(spool, chg); return ERR_PTR(-ENOSPC); } @@ -1146,7 +1159,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); vma_commit_reservation(h, vma, addr); - + /* update page cgroup details */ + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); return page; } -- cgit v1.2.3 From abb8206cb07734d0b7bf033c715995d6371a94c3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:24 -0700 Subject: hugetlb/cgroup: add hugetlb cgroup control files Add the control files for hugetlb controller [akpm@linux-foundation.org: s/CONFIG_CGROUP_HUGETLB_RES_CTLR/CONFIG_MEMCG_HUGETLB/g] [akpm@linux-foundation.org: s/CONFIG_MEMCG_HUGETLB/CONFIG_CGROUP_HUGETLB/] Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 16a0f32c4820..c57740bb203a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; @@ -1930,6 +1931,13 @@ void __init hugetlb_add_hstate(unsigned order) h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].lru.next for storing cgoup details. + */ + if (order >= HUGETLB_CGROUP_MIN_ORDER) + hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); parsed_hstate = h; } -- cgit v1.2.3 From 79dbb2368ae3515fad9c8b7c8f831cd86be59b1d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:32 -0700 Subject: hugetlb: move all the in use pages to active list When we fail to allocate pages from the reserve pool, hugetlb tries to allocate huge pages using alloc_buddy_huge_page. Add these to the active list. We also need to add the huge page we allocate when we soft offline the oldpage to active list. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c57740bb203a..ec7b86ebf9d9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -928,8 +928,14 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); - if (!page) + if (!page) { page = alloc_buddy_huge_page(h, nid); + if (page) { + spin_lock(&hugetlb_lock); + list_move(&page->lru, &h->hugepage_activelist); + spin_unlock(&hugetlb_lock); + } + } return page; } @@ -1155,6 +1161,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, hugepage_subpool_put_pages(spool, chg); return ERR_PTR(-ENOSPC); } + spin_lock(&hugetlb_lock); + list_move(&page->lru, &h->hugepage_activelist); + spin_unlock(&hugetlb_lock); } set_page_private(page, (unsigned long)spool); -- cgit v1.2.3 From 94ae8ba7176666d1e7d8bbb9f93670a27540b6a8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:35 -0700 Subject: hugetlb/cgroup: assign the page hugetlb cgroup when we move the page to active list. A page's hugetlb cgroup assignment and movement to the active list should occur with hugetlb_lock held. Otherwise when we remove the hugetlb cgroup we will iterate the active list and find pages with NULL hugetlb cgroup values. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ec7b86ebf9d9..c39e4beeb63a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -928,14 +928,8 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); - if (!page) { + if (!page) page = alloc_buddy_huge_page(h, nid); - if (page) { - spin_lock(&hugetlb_lock); - list_move(&page->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); - } - } return page; } @@ -1150,9 +1144,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); - spin_unlock(&hugetlb_lock); - - if (!page) { + if (page) { + /* update page cgroup details */ + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), + h_cg, page); + spin_unlock(&hugetlb_lock); + } else { + spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_cgroup_uncharge_cgroup(idx, @@ -1162,6 +1160,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), + h_cg, page); list_move(&page->lru, &h->hugepage_activelist); spin_unlock(&hugetlb_lock); } @@ -1169,8 +1169,6 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); vma_commit_reservation(h, vma, addr); - /* update page cgroup details */ - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); return page; } -- cgit v1.2.3 From d833352a4338dc31295ed832a30c9ccff5c7a183 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:46:20 -0700 Subject: mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables If a process creates a large hugetlbfs mapping that is eligible for page table sharing and forks heavily with children some of whom fault and others which destroy the mapping then it is possible for page tables to get corrupted. Some teardowns of the mapping encounter a "bad pmd" and output a message to the kernel log. The final teardown will trigger a BUG_ON in mm/filemap.c. This was reproduced in 3.4 but is known to have existed for a long time and goes back at least as far as 2.6.37. It was probably was introduced in 2.6.20 by [39dde65c: shared page table for hugetlb page]. The messages look like this; [ ..........] Lots of bad pmd messages followed by this [ 127.164256] mm/memory.c:391: bad pmd ffff880412e04fe8(80000003de4000e7). [ 127.164257] mm/memory.c:391: bad pmd ffff880412e04ff0(80000003de6000e7). [ 127.164258] mm/memory.c:391: bad pmd ffff880412e04ff8(80000003de0000e7). [ 127.186778] ------------[ cut here ]------------ [ 127.186781] kernel BUG at mm/filemap.c:134! [ 127.186782] invalid opcode: 0000 [#1] SMP [ 127.186783] CPU 7 [ 127.186784] Modules linked in: af_packet cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd dm_mod coretemp crc32c_intel usb_storage ghash_clmulni_intel aesni_intel i2c_i801 r8169 mii uas sr_mod cdrom sg iTCO_wdt iTCO_vendor_support shpchp serio_raw cryptd aes_x86_64 e1000e pci_hotplug dcdbas aes_generic container microcode ext4 mbcache jbd2 crc16 sd_mod crc_t10dif i915 drm_kms_helper drm i2c_algo_bit ehci_hcd ahci libahci usbcore rtc_cmos usb_common button i2c_core intel_agp video intel_gtt fan processor thermal thermal_sys hwmon ata_generic pata_atiixp libata scsi_mod [ 127.186801] [ 127.186802] Pid: 9017, comm: hugetlbfs-test Not tainted 3.4.0-autobuild #53 Dell Inc. OptiPlex 990/06D7TR [ 127.186804] RIP: 0010:[] [] __delete_from_page_cache+0x15e/0x160 [ 127.186809] RSP: 0000:ffff8804144b5c08 EFLAGS: 00010002 [ 127.186810] RAX: 0000000000000001 RBX: ffffea000a5c9000 RCX: 00000000ffffffc0 [ 127.186811] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff88042dfdad00 [ 127.186812] RBP: ffff8804144b5c18 R08: 0000000000000009 R09: 0000000000000003 [ 127.186813] R10: 0000000000000000 R11: 000000000000002d R12: ffff880412ff83d8 [ 127.186814] R13: ffff880412ff83d8 R14: 0000000000000000 R15: ffff880412ff83d8 [ 127.186815] FS: 00007fe18ed2c700(0000) GS:ffff88042dce0000(0000) knlGS:0000000000000000 [ 127.186816] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 127.186817] CR2: 00007fe340000503 CR3: 0000000417a14000 CR4: 00000000000407e0 [ 127.186818] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 127.186819] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 127.186820] Process hugetlbfs-test (pid: 9017, threadinfo ffff8804144b4000, task ffff880417f803c0) [ 127.186821] Stack: [ 127.186822] ffffea000a5c9000 0000000000000000 ffff8804144b5c48 ffffffff810ed83b [ 127.186824] ffff8804144b5c48 000000000000138a 0000000000001387 ffff8804144b5c98 [ 127.186825] ffff8804144b5d48 ffffffff811bc925 ffff8804144b5cb8 0000000000000000 [ 127.186827] Call Trace: [ 127.186829] [] delete_from_page_cache+0x3b/0x80 [ 127.186832] [] truncate_hugepages+0x115/0x220 [ 127.186834] [] hugetlbfs_evict_inode+0x13/0x30 [ 127.186837] [] evict+0xa7/0x1b0 [ 127.186839] [] iput_final+0xd3/0x1f0 [ 127.186840] [] iput+0x39/0x50 [ 127.186842] [] d_kill+0xf8/0x130 [ 127.186843] [] dput+0xd2/0x1a0 [ 127.186845] [] __fput+0x170/0x230 [ 127.186848] [] ? rb_erase+0xce/0x150 [ 127.186849] [] fput+0x1d/0x30 [ 127.186851] [] remove_vma+0x37/0x80 [ 127.186853] [] do_munmap+0x2d2/0x360 [ 127.186855] [] sys_shmdt+0xc9/0x170 [ 127.186857] [] system_call_fastpath+0x16/0x1b [ 127.186858] Code: 0f 1f 44 00 00 48 8b 43 08 48 8b 00 48 8b 40 28 8b b0 40 03 00 00 85 f6 0f 88 df fe ff ff 48 89 df e8 e7 cb 05 00 e9 d2 fe ff ff <0f> 0b 55 83 e2 fd 48 89 e5 48 83 ec 30 48 89 5d d8 4c 89 65 e0 [ 127.186868] RIP [] __delete_from_page_cache+0x15e/0x160 [ 127.186870] RSP [ 127.186871] ---[ end trace 7cbac5d1db69f426 ]--- The bug is a race and not always easy to reproduce. To reproduce it I was doing the following on a single socket I7-based machine with 16G of RAM. $ hugeadm --pool-pages-max DEFAULT:13G $ echo $((18*1048576*1024)) > /proc/sys/kernel/shmmax $ echo $((18*1048576*1024)) > /proc/sys/kernel/shmall $ for i in `seq 1 9000`; do ./hugetlbfs-test; done On my particular machine, it usually triggers within 10 minutes but enabling debug options can change the timing such that it never hits. Once the bug is triggered, the machine is in trouble and needs to be rebooted. The machine will respond but processes accessing proc like "ps aux" will hang due to the BUG_ON. shutdown will also hang and needs a hard reset or a sysrq-b. The basic problem is a race between page table sharing and teardown. For the most part page table sharing depends on i_mmap_mutex. In some cases, it is also taking the mm->page_table_lock for the PTE updates but with shared page tables, it is the i_mmap_mutex that is more important. Unfortunately it appears to be also insufficient. Consider the following situation Process A Process B --------- --------- hugetlb_fault shmdt LockWrite(mmap_sem) do_munmap unmap_region unmap_vmas unmap_single_vma unmap_hugepage_range Lock(i_mmap_mutex) Lock(mm->page_table_lock) huge_pmd_unshare/unmap tables <--- (1) Unlock(mm->page_table_lock) Unlock(i_mmap_mutex) huge_pte_alloc ... Lock(i_mmap_mutex) ... vma_prio_walk, find svma, spte ... Lock(mm->page_table_lock) ... share spte ... Unlock(mm->page_table_lock) ... Unlock(i_mmap_mutex) ... hugetlb_no_page <--- (2) free_pgtables unlink_file_vma hugetlb_free_pgd_range remove_vma_list In this scenario, it is possible for Process A to share page tables with Process B that is trying to tear them down. The i_mmap_mutex on its own does not prevent Process A walking Process B's page tables. At (1) above, the page tables are not shared yet so it unmaps the PMDs. Process A sets up page table sharing and at (2) faults a new entry. Process B then trips up on it in free_pgtables. This patch fixes the problem by adding a new function __unmap_hugepage_range_final that is only called when the VMA is about to be destroyed. This function clears VM_MAYSHARE during unmap_hugepage_range() under the i_mmap_mutex. This makes the VMA ineligible for sharing and avoids the race. Superficially this looks like it would then be vunerable to truncate and madvise issues but hugetlbfs has its own truncate handlers so does not use unmap_mapping_range() and does not support madvise(DONTNEED). This should be treated as a -stable candidate if it is merged. Test program is as follows. The test case was mostly written by Michal Hocko with a few minor changes to reproduce this bug. ==== CUT HERE ==== static size_t huge_page_size = (2UL << 20); static size_t nr_huge_page_A = 512; static size_t nr_huge_page_B = 5632; unsigned int get_random(unsigned int max) { struct timeval tv; gettimeofday(&tv, NULL); srandom(tv.tv_usec); return random() % max; } static void play(void *addr, size_t size) { unsigned char *start = addr, *end = start + size, *a; start += get_random(size/2); /* we could itterate on huge pages but let's give it more time. */ for (a = start; a < end; a += 4096) *a = 0; } int main(int argc, char **argv) { key_t key = IPC_PRIVATE; size_t sizeA = nr_huge_page_A * huge_page_size; size_t sizeB = nr_huge_page_B * huge_page_size; int shmidA, shmidB; void *addrA = NULL, *addrB = NULL; int nr_children = 300, n = 0; if ((shmidA = shmget(key, sizeA, IPC_CREAT|SHM_HUGETLB|0660)) == -1) { perror("shmget:"); return 1; } if ((addrA = shmat(shmidA, addrA, SHM_R|SHM_W)) == (void *)-1UL) { perror("shmat"); return 1; } if ((shmidB = shmget(key, sizeB, IPC_CREAT|SHM_HUGETLB|0660)) == -1) { perror("shmget:"); return 1; } if ((addrB = shmat(shmidB, addrB, SHM_R|SHM_W)) == (void *)-1UL) { perror("shmat"); return 1; } fork_child: switch(fork()) { case 0: switch (n%3) { case 0: play(addrA, sizeA); break; case 1: play(addrB, sizeB); break; case 2: break; } break; case -1: perror("fork:"); break; default: if (++n < nr_children) goto fork_child; play(addrA, sizeA); break; } shmdt(addrA); shmdt(addrB); do { wait(NULL); } while (--n > 0); shmctl(shmidA, IPC_RMID, NULL); shmctl(shmidB, IPC_RMID, NULL); return 0; } [akpm@linux-foundation.org: name the declaration's args, fix CONFIG_HUGETLBFS=n build] Signed-off-by: Hugh Dickins Reviewed-by: Michal Hocko Signed-off-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'mm/hugetlb.c') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c39e4beeb63a..bc727122dd44 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2429,6 +2429,25 @@ again: tlb_end_vma(tlb, vma); } +void __unmap_hugepage_range_final(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + __unmap_hugepage_range(tlb, vma, start, end, ref_page); + + /* + * Clear this flag so that x86's huge_pmd_share page_table_shareable + * test will fail on a vma being torn down, and not grab a page table + * on its way out. We're lucky that the flag has such an appropriate + * name, and can in fact be safely cleared here. We could clear it + * before the __unmap_hugepage_range above, but all that's necessary + * is to clear it before releasing the i_mmap_mutex. This works + * because in the context this is called, the VMA is about to be + * destroyed and the i_mmap_mutex is held. + */ + vma->vm_flags &= ~VM_MAYSHARE; +} + void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -3012,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, } } spin_unlock(&mm->page_table_lock); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); - + /* + * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare + * may have cleared our pud entry and done put_page on the page table: + * once we release i_mmap_mutex, another task can do the final put_page + * and that page table be reused and filled with junk. + */ flush_tlb_range(vma, start, end); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } int hugetlb_reserve_pages(struct inode *inode, -- cgit v1.2.3