 include/linux/huge_mm.h  |   5
 include/linux/mm.h       |   5
 include/linux/mm_types.h |   2
 mm/huge_memory.c         | 139
 mm/migrate.c             |   1
 mm/page_alloc.c          |  27
 mm/rmap.c                |   7
 7 files changed, 174 insertions(+), 12 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 90e11e6a37ab..7aec5ee9cfdf 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -90,11 +90,15 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma);

 extern unsigned long transparent_hugepage_flags;

+extern void prep_transhuge_page(struct page *page);
+extern void free_transhuge_page(struct page *page);
+
 int split_huge_page_to_list(struct page *page, struct list_head *list);
 static inline int split_huge_page(struct page *page)
 {
 	return split_huge_page_to_list(page, NULL);
 }
+void deferred_split_huge_page(struct page *page);

 void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long address);
@@ -170,6 +174,7 @@ static inline int split_huge_page(struct page *page)
 {
 	return 0;
 }
+static inline void deferred_split_huge_page(struct page *page) {}
 #define split_huge_pmd(__vma, __pmd, __address)	\
 	do { } while (0)
 static inline int hugepage_madvise(struct vm_area_struct *vma,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e4397f640e86..aa8ae8330a75 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -508,6 +508,9 @@ enum compound_dtor_id {
 #ifdef CONFIG_HUGETLB_PAGE
 	HUGETLB_PAGE_DTOR,
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	TRANSHUGE_PAGE_DTOR,
+#endif
 	NR_COMPOUND_DTORS,
 };
 extern compound_page_dtor * const compound_page_dtors[];
@@ -537,6 +540,8 @@ static inline void set_compound_order(struct page *page, unsigned int order)
 	page[1].compound_order = order;
 }

+void free_compound_page(struct page *page);
+
 #ifdef CONFIG_MMU
 /*
  * Do pte_mkwrite, but only if the vma says VM_WRITE.  We do this when
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 809defe0597d..2dd9c313a8c0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -55,6 +55,7 @@ struct page {
 					 */
 		void *s_mem;			/* slab first object */
 		atomic_t compound_mapcount;	/* first tail page */
+		/* page_deferred_list().next -- second tail page */
 	};

 	/* Second double word */
@@ -62,6 +63,7 @@ struct page {
 		union {
 			pgoff_t index;		/* Our offset within mapping. */
 			void *freelist;		/* sl[aou]b first free object */
+			/* page_deferred_list().prev -- second tail page */
 		};

 		union {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b6ac6c43d6a4..4acf55b31f7c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -135,6 +135,10 @@ static struct khugepaged_scan khugepaged_scan = {
 	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };

+static DEFINE_SPINLOCK(split_queue_lock);
+static LIST_HEAD(split_queue);
+static unsigned long split_queue_len;
+static struct shrinker deferred_split_shrinker;

 static void set_recommended_min_free_kbytes(void)
 {
@@ -667,6 +671,9 @@ static int __init hugepage_init(void)
 	err = register_shrinker(&huge_zero_page_shrinker);
 	if (err)
 		goto err_hzp_shrinker;
+	err = register_shrinker(&deferred_split_shrinker);
+	if (err)
+		goto err_split_shrinker;

 	/*
 	 * By default disable transparent hugepages on smaller systems,
@@ -684,6 +691,8 @@ static int __init hugepage_init(void)
 	return 0;
 err_khugepaged:
+	unregister_shrinker(&deferred_split_shrinker);
+err_split_shrinker:
 	unregister_shrinker(&huge_zero_page_shrinker);
 err_hzp_shrinker:
 	khugepaged_slab_exit();
@@ -740,6 +749,27 @@ static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 	return entry;
 }

+static inline struct list_head *page_deferred_list(struct page *page)
+{
+	/*
+	 * ->lru in the tail pages is occupied by compound_head.
+	 * Let's use ->mapping + ->index in the second tail page as list_head.
+	 */
+	return (struct list_head *)&page[2].mapping;
+}
+
+void prep_transhuge_page(struct page *page)
+{
+	/*
+	 * we use page->mapping and page->index in the second tail page
+	 * as list_head: assuming THP order >= 2
+	 */
+	BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
+	INIT_LIST_HEAD(page_deferred_list(page));
+	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
+}
+
 static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					struct vm_area_struct *vma,
 					unsigned long address, pmd_t *pmd,
@@ -896,6 +926,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
 	}
+	prep_transhuge_page(page);
 	return __do_huge_pmd_anonymous_page(mm, vma, address, pmd, page, gfp,
 					    flags);
 }
@@ -1192,7 +1223,9 @@ alloc:
 	} else
 		new_page = NULL;

-	if (unlikely(!new_page)) {
+	if (likely(new_page)) {
+		prep_transhuge_page(new_page);
+	} else {
 		if (!page) {
 			split_huge_pmd(vma, pmd, address);
 			ret |= VM_FAULT_FALLBACK;
@@ -2109,6 +2142,7 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, struct mm_struct *mm,
 		return NULL;
 	}

+	prep_transhuge_page(*hpage);
 	count_vm_event(THP_COLLAPSE_ALLOC);
 	return *hpage;
 }
@@ -2120,8 +2154,12 @@ static int khugepaged_find_target_node(void)

 static inline struct page *alloc_hugepage(int defrag)
 {
-	return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
-			   HPAGE_PMD_ORDER);
+	struct page *page;
+
+	page = alloc_pages(alloc_hugepage_gfpmask(defrag, 0), HPAGE_PMD_ORDER);
+	if (page)
+		prep_transhuge_page(page);
+	return page;
 }

 static struct page *khugepaged_alloc_hugepage(bool *wait)
@@ -3098,7 +3136,7 @@ static int __split_huge_page_tail(struct page *head, int tail,
 	set_page_idle(page_tail);

 	/* ->mapping in first tail page is compound_mapcount */
-	VM_BUG_ON_PAGE(tail != 1 && page_tail->mapping != TAIL_MAPPING,
+	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
 			page_tail);
 	page_tail->mapping = head->mapping;

@@ -3207,12 +3245,20 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	freeze_page(anon_vma, head);
 	VM_BUG_ON_PAGE(compound_mapcount(head), head);

+	/* Prevent deferred_split_scan() touching ->_count */
+	spin_lock(&split_queue_lock);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
 	if (mapcount == count - 1) {
+		if (!list_empty(page_deferred_list(head))) {
+			split_queue_len--;
+			list_del(page_deferred_list(head));
+		}
+		spin_unlock(&split_queue_lock);
 		__split_huge_page(page, list);
 		ret = 0;
 	} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount > count - 1) {
+		spin_unlock(&split_queue_lock);
 		pr_alert("total_mapcount: %u, page_count(): %u\n",
 				mapcount, count);
 		if (PageTail(page))
@@ -3220,6 +3266,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		dump_page(page, "total_mapcount(head) > page_count(head) - 1");
 		BUG();
 	} else {
+		spin_unlock(&split_queue_lock);
 		unfreeze_page(anon_vma, head);
 		ret = -EBUSY;
 	}
@@ -3231,3 +3278,87 @@ out:
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
 	return ret;
 }
+
+void free_transhuge_page(struct page *page)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&split_queue_lock, flags);
+	if (!list_empty(page_deferred_list(page))) {
+		split_queue_len--;
+		list_del(page_deferred_list(page));
+	}
+	spin_unlock_irqrestore(&split_queue_lock, flags);
+	free_compound_page(page);
+}
+
+void deferred_split_huge_page(struct page *page)
+{
+	unsigned long flags;
+
+	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+
+	spin_lock_irqsave(&split_queue_lock, flags);
+	if (list_empty(page_deferred_list(page))) {
+		list_add_tail(page_deferred_list(page), &split_queue);
+		split_queue_len++;
+	}
+	spin_unlock_irqrestore(&split_queue_lock, flags);
+}
+
+static unsigned long deferred_split_count(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	/*
+	 * Splitting a page from split_queue will free up at least one page,
+	 * at most HPAGE_PMD_NR - 1. We don't track the exact number;
+	 * use HPAGE_PMD_NR / 2 as a ballpark.
+	 */
+	return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+}
+
+static unsigned long deferred_split_scan(struct shrinker *shrink,
+		struct shrink_control *sc)
+{
+	unsigned long flags;
+	LIST_HEAD(list), *pos, *next;
+	struct page *page;
+	int split = 0;
+
+	spin_lock_irqsave(&split_queue_lock, flags);
+	list_splice_init(&split_queue, &list);
+
+	/* Take a pin on all head pages to avoid freeing them under us */
+	list_for_each_safe(pos, next, &list) {
+		page = list_entry((void *)pos, struct page, mapping);
+		page = compound_head(page);
+		/* race with put_compound_page() */
+		if (!get_page_unless_zero(page)) {
+			list_del_init(page_deferred_list(page));
+			split_queue_len--;
+		}
+	}
+	spin_unlock_irqrestore(&split_queue_lock, flags);
+
+	list_for_each_safe(pos, next, &list) {
+		page = list_entry((void *)pos, struct page, mapping);
+		lock_page(page);
+		/* split_huge_page() removes page from list on success */
+		if (!split_huge_page(page))
+			split++;
+		unlock_page(page);
+		put_page(page);
+	}
+
+	spin_lock_irqsave(&split_queue_lock, flags);
+	list_splice_tail(&list, &split_queue);
+	spin_unlock_irqrestore(&split_queue_lock, flags);
+
+	return split * HPAGE_PMD_NR / 2;
+}
+
+static struct shrinker deferred_split_shrinker = {
+	.count_objects = deferred_split_count,
+	.scan_objects = deferred_split_scan,
+	.seeks = DEFAULT_SEEKS,
+};
diff --git a/mm/migrate.c b/mm/migrate.c
index dec81a9e2fd6..b1034f9c77e7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1760,6 +1760,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 					   HPAGE_PMD_ORDER);
 	if (!new_page)
 		goto out_fail;
+	prep_transhuge_page(new_page);

 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3221091da513..25409714160e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -222,13 +222,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };

-static void free_compound_page(struct page *page);
 compound_page_dtor * const compound_page_dtors[] = {
 	NULL,
 	free_compound_page,
 #ifdef CONFIG_HUGETLB_PAGE
 	free_huge_page,
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	free_transhuge_page,
+#endif
 };

 int min_free_kbytes = 1024;
@@ -450,7 +452,7 @@ out:
  * This usage means that zero-order pages may not be compound.
  */
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
 {
 	__free_pages_ok(page, compound_order(page));
 }
@@ -858,15 +860,26 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
 		ret = 0;
 		goto out;
 	}
-	/* mapping in first tail page is used for compound_mapcount() */
-	if (page - head_page == 1) {
+	switch (page - head_page) {
+	case 1:
+		/* the first tail page: ->mapping is compound_mapcount() */
 		if (unlikely(compound_mapcount(page))) {
 			bad_page(page, "nonzero compound_mapcount", 0);
 			goto out;
 		}
-	} else if (page->mapping != TAIL_MAPPING) {
-		bad_page(page, "corrupted mapping in tail page", 0);
-		goto out;
+		break;
+	case 2:
+		/*
+		 * the second tail page: ->mapping is
+		 * page_deferred_list().next -- ignore value.
+		 */
+		break;
+	default:
+		if (page->mapping != TAIL_MAPPING) {
+			bad_page(page, "corrupted mapping in tail page", 0);
+			goto out;
+		}
+		break;
 	}
 	if (unlikely(!PageTail(page))) {
 		bad_page(page, "PageTail not set", 0);
diff --git a/mm/rmap.c b/mm/rmap.c
index fc707df92ede..84271cc39d1e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1282,8 +1282,10 @@ static void page_remove_anon_compound_rmap(struct page *page)
 		nr = HPAGE_PMD_NR;
 	}

-	if (nr)
+	if (nr) {
 		__mod_zone_page_state(page_zone(page), NR_ANON_PAGES, -nr);
+		deferred_split_huge_page(page);
+	}
 }

 /**
@@ -1318,6 +1320,9 @@ void page_remove_rmap(struct page *page, bool compound)
 	if (unlikely(PageMlocked(page)))
 		clear_page_mlock(page);

+	if (PageTransCompound(page))
+		deferred_split_huge_page(compound_head(page));
+
 	/*
 	 * It would be tidy to reset the PageAnon mapping here,
 	 * but that might overwrite a racing page_add_anon_rmap
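
A note on the layout trick in page_deferred_list(): it works because ->mapping and ->index are adjacent, pointer-sized words of struct page that are otherwise unused in the second tail page of a compound page, so together they have exactly the shape of a struct list_head (next, prev). Below is a minimal user-space sketch of the same field reuse; struct fake_page and the helper names are illustrative stand-ins, not kernel code.

#include <stddef.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

/* Stand-in for the words of a tail struct page that the patch reuses. */
struct fake_page {
	unsigned long flags;
	void *mapping;		/* overlaid as list_head.next */
	unsigned long index;	/* overlaid as list_head.prev */
};

/* The cast below is only sound if the two words line up with a list_head. */
_Static_assert(offsetof(struct fake_page, index) ==
	       offsetof(struct fake_page, mapping) + sizeof(void *),
	       "mapping and index must be adjacent");
_Static_assert(sizeof(unsigned long) == sizeof(struct list_head *),
	       "index must be pointer-sized");

static struct list_head *fake_deferred_list(struct fake_page *p)
{
	return (struct list_head *)&p->mapping;
}

static void init_list(struct list_head *list)
{
	list->next = list->prev = list;
}

static void list_add_tail_local(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

int main(void)
{
	struct list_head queue;
	struct fake_page pages[3];

	init_list(&queue);
	for (int i = 0; i < 3; i++) {
		init_list(fake_deferred_list(&pages[i]));
		list_add_tail_local(fake_deferred_list(&pages[i]), &queue);
	}

	/* Recover each page from its embedded list node, container_of-style. */
	for (struct list_head *pos = queue.next; pos != &queue; pos = pos->next) {
		struct fake_page *p = (struct fake_page *)
			((char *)pos - offsetof(struct fake_page, mapping));
		printf("queued fake page %td\n", p - pages);
	}
	return 0;
}

This is also why prep_transhuge_page() has BUILD_BUG_ON(HPAGE_PMD_ORDER < 2): the trick needs a second tail page to exist.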
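The mm.h and page_alloc.c changes extend an existing dispatch table: compound_page_dtors[] maps the destructor id stored in the first tail page to a function, so freeing a THP automatically routes through free_transhuge_page(), which unlinks the page from the deferred-split queue before the plain compound free. A sketch of the same table-dispatch pattern, with made-up ids and handlers:

#include <stdio.h>

/* Illustrative ids, mirroring the shape of enum compound_dtor_id. */
enum dtor_id {
	NULL_DTOR,
	PLAIN_DTOR,
	DEFERRED_DTOR,
	NR_DTORS,
};

struct obj {
	enum dtor_id dtor;	/* like page[1].compound_dtor */
};

static void free_plain(struct obj *obj)
{
	printf("freeing object %p\n", (void *)obj);
}

static void free_deferred(struct obj *obj)
{
	/* unlink from a pending-work queue first, then do the plain free */
	printf("unlinking object %p from the queue\n", (void *)obj);
	free_plain(obj);
}

/* One slot per id, like compound_page_dtors[]. */
static void (* const dtors[NR_DTORS])(struct obj *) = {
	[NULL_DTOR]     = NULL,
	[PLAIN_DTOR]    = free_plain,
	[DEFERRED_DTOR] = free_deferred,
};

int main(void)
{
	struct obj o = { .dtor = DEFERRED_DTOR };

	if (dtors[o.dtor])
		dtors[o.dtor](&o);	/* dispatches to free_deferred() */
	return 0;
}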
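Finally, deferred_split_scan() follows a common kernel pattern: splice the shared queue onto a private list under the lock, pin every entry, drop the lock, do the expensive work (the actual splits) unlocked, then splice whatever failed back onto the queue. A user-space model of that splice-work-requeue flow, with a boolean standing in for "split failed because the page is still mapped" (all names here are hypothetical):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a queued huge page; "busy" models a split that fails. */
struct node {
	struct node *next;
	int id;
	bool busy;
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *queue;

static void enqueue(struct node *n)
{
	pthread_mutex_lock(&queue_lock);
	n->next = queue;
	queue = n;
	pthread_mutex_unlock(&queue_lock);
}

/* Mirrors the shape of deferred_split_scan(): steal, work unlocked, requeue. */
static int scan(void)
{
	struct node *list, *n, *next, *requeue = NULL;
	int split = 0;

	pthread_mutex_lock(&queue_lock);
	list = queue;			/* like list_splice_init() */
	queue = NULL;
	pthread_mutex_unlock(&queue_lock);

	for (n = list; n; n = next) {
		next = n->next;
		if (!n->busy) {
			split++;	/* split succeeded; entry is gone */
		} else {
			n->next = requeue;	/* keep for a later pass */
			requeue = n;
		}
	}

	pthread_mutex_lock(&queue_lock);
	for (n = requeue; n; n = next) {	/* like list_splice_tail() */
		next = n->next;
		n->next = queue;
		queue = n;
	}
	pthread_mutex_unlock(&queue_lock);
	return split;
}

int main(void)
{
	struct node a = { .id = 1, .busy = false };
	struct node b = { .id = 2, .busy = true };

	enqueue(&a);
	enqueue(&b);
	printf("split %d page(s); busy pages stay queued\n", scan());
	return 0;
}

The pinning step in the real code (get_page_unless_zero() before dropping the lock) has no analogue here; it exists because a page can be freed concurrently once the lock is released, a hazard this single-threaded model does not reproduce.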