diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 2 | ||||
-rw-r--r-- | mm/huge_memory.c | 59 | ||||
-rw-r--r-- | mm/hugetlb.c | 1 | ||||
-rw-r--r-- | mm/internal.h | 27 | ||||
-rw-r--r-- | mm/memcontrol.c | 29 | ||||
-rw-r--r-- | mm/mempolicy.c | 17 | ||||
-rw-r--r-- | mm/page_alloc.c | 21 | ||||
-rw-r--r-- | mm/readahead.c | 11 | ||||
-rw-r--r-- | mm/slab_common.c | 2 |
9 files changed, 108 insertions, 61 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 5e9359e4ff9e..2c308413387f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1957,8 +1957,6 @@ no_page: gfp_t alloc_gfp = gfp; err = -ENOMEM; - if (order == 1) - order = 0; if (order > 0) alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN; folio = filemap_alloc_folio(alloc_gfp, order); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7b4cb5c68b61..635f0f0f6860 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -569,8 +569,8 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio) void folio_prep_large_rmappable(struct folio *folio) { - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); - INIT_LIST_HEAD(&folio->_deferred_list); + if (!folio || !folio_test_large(folio)) + return; folio_set_large_rmappable(folio); } @@ -2720,9 +2720,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); if (folio_ref_freeze(folio, 1 + extra_pins)) { - if (!list_empty(&folio->_deferred_list)) { + if (folio_order(folio) > 1 && + !list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); + list_del_init(&folio->_deferred_list); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { @@ -2766,26 +2767,38 @@ out: return ret; } -void folio_undo_large_rmappable(struct folio *folio) +/* + * __folio_unqueue_deferred_split() is not to be called directly: + * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h + * limits its calls to those folios which may have a _deferred_list for + * queueing THP splits, and that list is (racily observed to be) non-empty. + * + * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is + * zero: because even when split_queue_lock is held, a non-empty _deferred_list + * might be in use on deferred_split_scan()'s unlocked on-stack list. + * + * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is + * therefore important to unqueue deferred split before changing folio memcg. + */ +bool __folio_unqueue_deferred_split(struct folio *folio) { struct deferred_split *ds_queue; unsigned long flags; + bool unqueued = false; - /* - * At this point, there is no one trying to add the folio to - * deferred_list. If folio is not in deferred_list, it's safe - * to check without acquiring the split_queue_lock. - */ - if (data_race(list_empty(&folio->_deferred_list))) - return; + WARN_ON_ONCE(folio_ref_count(folio)); + WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio)); ds_queue = get_deferred_split_queue(folio); spin_lock_irqsave(&ds_queue->split_queue_lock, flags); if (!list_empty(&folio->_deferred_list)) { ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); + list_del_init(&folio->_deferred_list); + unqueued = true; } spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + + return unqueued; /* useful for debug warnings */ } void deferred_split_folio(struct folio *folio) @@ -2796,17 +2809,19 @@ void deferred_split_folio(struct folio *folio) #endif unsigned long flags; - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); + /* + * Order 1 folios have no space for a deferred list, but we also + * won't waste much memory by not adding them to the deferred list. + */ + if (folio_order(folio) <= 1) + return; /* - * The try_to_unmap() in page reclaim path might reach here too, - * this may cause a race condition to corrupt deferred split queue. - * And, if page reclaim is already handling the same folio, it is - * unnecessary to handle it again in shrinker. - * - * Check the swapcache flag to determine if the folio is being - * handled by page reclaim since THP swap would add the folio into - * swap cache before calling try_to_unmap(). + * Exclude swapcache: originally to avoid a corrupt deferred split + * queue. Nowadays that is fully prevented by mem_cgroup_swapout(); + * but if page reclaim is already handling the same folio, it is + * unnecessary to handle it again in the shrinker, so excluding + * swapcache here may still be a useful optimization. */ if (folio_test_swapcache(folio)) return; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0acb04c3e952..92b955cc5a41 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1795,6 +1795,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, destroy_compound_gigantic_folio(folio, huge_page_order(h)); free_gigantic_folio(folio, huge_page_order(h)); } else { + INIT_LIST_HEAD(&folio->_deferred_list); __free_pages(&folio->page, huge_page_order(h)); } } diff --git a/mm/internal.h b/mm/internal.h index ef8d787a510c..b30907537801 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -413,7 +413,30 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) #endif } -void folio_undo_large_rmappable(struct folio *folio); +bool __folio_unqueue_deferred_split(struct folio *folio); +static inline bool folio_unqueue_deferred_split(struct folio *folio) +{ + if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio)) + return false; + + /* + * At this point, there is no one trying to add the folio to + * deferred_list. If folio is not in deferred_list, it's safe + * to check without acquiring the split_queue_lock. + */ + if (data_race(list_empty(&folio->_deferred_list))) + return false; + + return __folio_unqueue_deferred_split(folio); +} + +static inline struct folio *page_rmappable_folio(struct page *page) +{ + struct folio *folio = (struct folio *)page; + + folio_prep_large_rmappable(folio); + return folio; +} static inline void prep_compound_head(struct page *page, unsigned int order) { @@ -423,6 +446,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order) atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); atomic_set(&folio->_pincount, 0); + if (order > 1) + INIT_LIST_HEAD(&folio->_deferred_list); } static inline void prep_compound_tail(struct page *head, int tail_idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 110afda740a1..d2ceadd11b10 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5873,6 +5873,8 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); + /* Warning should never happen, so don't worry about refcount non-0 */ + WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = (unsigned long)to; __folio_memcg_unlock(from); @@ -6237,7 +6239,10 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, enum mc_target_type target_type; union mc_target target; struct page *page; + struct folio *folio; + bool tried_split_before = false; +retry_pmd: ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { @@ -6247,6 +6252,28 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); if (target_type == MC_TARGET_PAGE) { page = target.page; + folio = page_folio(page); + /* + * Deferred split queue locking depends on memcg, + * and unqueue is unsafe unless folio refcount is 0: + * split or skip if on the queue? first try to split. + */ + if (!list_empty(&folio->_deferred_list)) { + spin_unlock(ptl); + if (!tried_split_before) + split_folio(folio); + folio_unlock(folio); + folio_put(folio); + if (tried_split_before) + return 0; + tried_split_before = true; + goto retry_pmd; + } + /* + * So long as that pmd lock is held, the folio cannot + * be racily added to the _deferred_list, because + * page_remove_rmap() will find it still pmdmapped. + */ if (isolate_lru_page(page)) { if (!mem_cgroup_move_account(page, true, mc.from, mc.to)) { @@ -7199,6 +7226,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) ug->nr_memory += nr_pages; ug->pgpgout++; + WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); folio->memcg_data = 0; } @@ -7492,6 +7520,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) VM_BUG_ON_FOLIO(oldid, folio); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); + folio_unqueue_deferred_split(folio); folio->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4cae854c0f28..109826a2af38 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2200,10 +2200,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, mpol_cond_put(pol); gfp |= __GFP_COMP; page = alloc_page_interleave(gfp, order, nid); - folio = (struct folio *)page; - if (folio && order > 1) - folio_prep_large_rmappable(folio); - goto out; + return page_rmappable_folio(page); } if (pol->mode == MPOL_PREFERRED_MANY) { @@ -2213,10 +2210,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, gfp |= __GFP_COMP; page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); - folio = (struct folio *)page; - if (folio && order > 1) - folio_prep_large_rmappable(folio); - goto out; + return page_rmappable_folio(page); } if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { @@ -2310,12 +2304,7 @@ EXPORT_SYMBOL(alloc_pages); struct folio *folio_alloc(gfp_t gfp, unsigned order) { - struct page *page = alloc_pages(gfp | __GFP_COMP, order); - struct folio *folio = (struct folio *)page; - - if (folio && order > 1) - folio_prep_large_rmappable(folio); - return folio; + return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } EXPORT_SYMBOL(folio_alloc); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1bbbf2f8b7e4..7272a922b838 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -600,9 +600,7 @@ void destroy_large_folio(struct folio *folio) return; } - if (folio_test_large_rmappable(folio)) - folio_undo_large_rmappable(folio); - + folio_unqueue_deferred_split(folio); mem_cgroup_uncharge(folio); free_the_page(&folio->page, folio_order(folio)); } @@ -1002,10 +1000,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page) } break; case 2: - /* - * the second tail page: ->mapping is - * deferred_list.next -- ignore value. - */ + /* the second tail page: deferred_list overlaps ->mapping */ + if (unlikely(!list_empty(&folio->_deferred_list))) { + bad_page(page, "on deferred list"); + goto out; + } break; default: if (page->mapping != TAIL_MAPPING) { @@ -4464,12 +4463,8 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { struct page *page = __alloc_pages(gfp | __GFP_COMP, order, - preferred_nid, nodemask); - struct folio *folio = (struct folio *)page; - - if (folio && order > 1) - folio_prep_large_rmappable(folio); - return folio; + preferred_nid, nodemask); + return page_rmappable_folio(page); } EXPORT_SYMBOL(__folio_alloc); diff --git a/mm/readahead.c b/mm/readahead.c index 7c0449f8bec7..e9b11d928b0c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -514,16 +514,11 @@ void page_cache_ra_order(struct readahead_control *ractl, unsigned int order = new_order; /* Align with smaller pages if needed */ - if (index & ((1UL << order) - 1)) { + if (index & ((1UL << order) - 1)) order = __ffs(index); - if (order == 1) - order = 0; - } /* Don't allocate pages past EOF */ - while (index + (1UL << order) - 1 > limit) { - if (--order == 1) - order = 0; - } + while (index + (1UL << order) - 1 > limit) + order--; err = ra_alloc_folio(ractl, index, mark, order, gfp); if (err) break; diff --git a/mm/slab_common.c b/mm/slab_common.c index ef971fcdaa07..2e2b43fae2c3 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1391,7 +1391,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) /* Zero out spare memory. */ if (want_init_on_alloc(flags)) { kasan_disable_current(); - memset((void *)p + new_size, 0, ks - new_size); + memset(kasan_reset_tag(p) + new_size, 0, ks - new_size); kasan_enable_current(); } |