Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c       2
-rw-r--r--  mm/huge_memory.c  59
-rw-r--r--  mm/hugetlb.c       1
-rw-r--r--  mm/internal.h     27
-rw-r--r--  mm/memcontrol.c   29
-rw-r--r--  mm/mempolicy.c    17
-rw-r--r--  mm/page_alloc.c   21
-rw-r--r--  mm/readahead.c    11
-rw-r--r--  mm/slab_common.c   2
9 files changed, 108 insertions, 61 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 5e9359e4ff9e..2c308413387f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1957,8 +1957,6 @@ no_page:
gfp_t alloc_gfp = gfp;
err = -ENOMEM;
- if (order == 1)
- order = 0;
if (order > 0)
alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
folio = filemap_alloc_folio(alloc_gfp, order);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7b4cb5c68b61..635f0f0f6860 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -569,8 +569,8 @@ struct deferred_split *get_deferred_split_queue(struct folio *folio)
void folio_prep_large_rmappable(struct folio *folio)
{
- VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
- INIT_LIST_HEAD(&folio->_deferred_list);
+ if (!folio || !folio_test_large(folio))
+ return;
folio_set_large_rmappable(folio);
}
@@ -2720,9 +2720,10 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
if (folio_ref_freeze(folio, 1 + extra_pins)) {
- if (!list_empty(&folio->_deferred_list)) {
+ if (folio_order(folio) > 1 &&
+ !list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
- list_del(&folio->_deferred_list);
+ list_del_init(&folio->_deferred_list);
}
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
@@ -2766,26 +2767,38 @@ out:
return ret;
}
-void folio_undo_large_rmappable(struct folio *folio)
+/*
+ * __folio_unqueue_deferred_split() is not to be called directly:
+ * the folio_unqueue_deferred_split() inline wrapper in mm/internal.h
+ * limits its calls to those folios which may have a _deferred_list for
+ * queueing THP splits, and that list is (racily observed to be) non-empty.
+ *
+ * It is unsafe to call folio_unqueue_deferred_split() until folio refcount is
+ * zero: because even when split_queue_lock is held, a non-empty _deferred_list
+ * might be in use on deferred_split_scan()'s unlocked on-stack list.
+ *
+ * If memory cgroups are enabled, split_queue_lock is in the mem_cgroup: it is
+ * therefore important to unqueue deferred split before changing folio memcg.
+ */
+bool __folio_unqueue_deferred_split(struct folio *folio)
{
struct deferred_split *ds_queue;
unsigned long flags;
+ bool unqueued = false;
- /*
- * At this point, there is no one trying to add the folio to
- * deferred_list. If folio is not in deferred_list, it's safe
- * to check without acquiring the split_queue_lock.
- */
- if (data_race(list_empty(&folio->_deferred_list)))
- return;
+ WARN_ON_ONCE(folio_ref_count(folio));
+ WARN_ON_ONCE(!mem_cgroup_disabled() && !folio_memcg(folio));
ds_queue = get_deferred_split_queue(folio);
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
- list_del(&folio->_deferred_list);
+ list_del_init(&folio->_deferred_list);
+ unqueued = true;
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
+
+ return unqueued; /* useful for debug warnings */
}
void deferred_split_folio(struct folio *folio)
@@ -2796,17 +2809,19 @@ void deferred_split_folio(struct folio *folio)
#endif
unsigned long flags;
- VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio);
+ /*
+ * Order 1 folios have no space for a deferred list, but we also
+ * won't waste much memory by not adding them to the deferred list.
+ */
+ if (folio_order(folio) <= 1)
+ return;
/*
- * The try_to_unmap() in page reclaim path might reach here too,
- * this may cause a race condition to corrupt deferred split queue.
- * And, if page reclaim is already handling the same folio, it is
- * unnecessary to handle it again in shrinker.
- *
- * Check the swapcache flag to determine if the folio is being
- * handled by page reclaim since THP swap would add the folio into
- * swap cache before calling try_to_unmap().
+ * Exclude swapcache: originally to avoid a corrupt deferred split
+ * queue. Nowadays that is fully prevented by mem_cgroup_swapout();
+ * but if page reclaim is already handling the same folio, it is
+ * unnecessary to handle it again in the shrinker, so excluding
+ * swapcache here may still be a useful optimization.
*/
if (folio_test_swapcache(folio))
return;
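
The functional core of the huge_memory.c changes is that a folio is now taken off the deferred split queue with list_del_init() rather than list_del(), so later list_empty() checks on _deferred_list see an unqueued folio as off-list. The short userspace program below is an illustrative sketch only, not part of this patch: it re-implements just enough of the kernel's list_head API (INIT_LIST_HEAD, list_add, list_del_init, list_empty) to demonstrate that property.

#include <stdio.h>
#include <stdbool.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h; h->prev = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del_init(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	INIT_LIST_HEAD(entry);	/* unlike list_del(), entry reads as empty again */
}

int main(void)
{
	struct list_head queue, node;

	INIT_LIST_HEAD(&queue);
	INIT_LIST_HEAD(&node);

	list_add(&node, &queue);	/* like deferred_split_folio() queueing */
	printf("queued:   list_empty(node) = %d\n", list_empty(&node));	/* 0 */

	list_del_init(&node);		/* like __folio_unqueue_deferred_split() */
	printf("unqueued: list_empty(node) = %d\n", list_empty(&node));	/* 1 */
	return 0;
}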
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0acb04c3e952..92b955cc5a41 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1795,6 +1795,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
destroy_compound_gigantic_folio(folio, huge_page_order(h));
free_gigantic_folio(folio, huge_page_order(h));
} else {
+ INIT_LIST_HEAD(&folio->_deferred_list);
__free_pages(&folio->page, huge_page_order(h));
}
}
diff --git a/mm/internal.h b/mm/internal.h
index ef8d787a510c..b30907537801 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -413,7 +413,30 @@ static inline void folio_set_order(struct folio *folio, unsigned int order)
#endif
}
-void folio_undo_large_rmappable(struct folio *folio);
+bool __folio_unqueue_deferred_split(struct folio *folio);
+static inline bool folio_unqueue_deferred_split(struct folio *folio)
+{
+ if (folio_order(folio) <= 1 || !folio_test_large_rmappable(folio))
+ return false;
+
+ /*
+ * At this point, there is no one trying to add the folio to
+ * deferred_list. If folio is not in deferred_list, it's safe
+ * to check without acquiring the split_queue_lock.
+ */
+ if (data_race(list_empty(&folio->_deferred_list)))
+ return false;
+
+ return __folio_unqueue_deferred_split(folio);
+}
+
+static inline struct folio *page_rmappable_folio(struct page *page)
+{
+ struct folio *folio = (struct folio *)page;
+
+ folio_prep_large_rmappable(folio);
+ return folio;
+}
static inline void prep_compound_head(struct page *page, unsigned int order)
{
@@ -423,6 +446,8 @@ static inline void prep_compound_head(struct page *page, unsigned int order)
atomic_set(&folio->_entire_mapcount, -1);
atomic_set(&folio->_nr_pages_mapped, 0);
atomic_set(&folio->_pincount, 0);
+ if (order > 1)
+ INIT_LIST_HEAD(&folio->_deferred_list);
}
static inline void prep_compound_tail(struct page *head, int tail_idx)
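
internal.h now splits the unqueue operation into a cheap inline filter and an out-of-line slow path: folio_unqueue_deferred_split() rejects folios that can never be queued (order <= 1, not large_rmappable) and does a racy list_empty() pre-check, and only __folio_unqueue_deferred_split() takes split_queue_lock. The userspace sketch below mirrors that fast-path/slow-path shape under invented names (my_folio, dsq_unqueue, a queued flag standing in for the list check); it is illustrative, not kernel code.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct my_folio {
	int order;
	bool large_rmappable;
	bool queued;	/* stands in for !list_empty(&folio->_deferred_list) */
};

static pthread_mutex_t split_queue_lock = PTHREAD_MUTEX_INITIALIZER;
static int split_queue_len;

/* slow path: takes the lock and really unqueues, like __folio_unqueue_deferred_split() */
static bool dsq_unqueue_slow(struct my_folio *f)
{
	bool unqueued = false;

	pthread_mutex_lock(&split_queue_lock);
	if (f->queued) {
		f->queued = false;
		split_queue_len--;
		unqueued = true;
	}
	pthread_mutex_unlock(&split_queue_lock);
	return unqueued;
}

/* fast path: cheap, lock-free filtering, like the inline wrapper in internal.h */
static inline bool dsq_unqueue(struct my_folio *f)
{
	if (f->order <= 1 || !f->large_rmappable)
		return false;	/* can never have a _deferred_list */
	if (!f->queued)
		return false;	/* racy pre-check, confirmed under the lock */
	return dsq_unqueue_slow(f);
}

int main(void)
{
	struct my_folio small = { .order = 1, .large_rmappable = true, .queued = false };
	struct my_folio big   = { .order = 4, .large_rmappable = true, .queued = true };

	split_queue_len = 1;
	printf("order-1: %d\n", dsq_unqueue(&small));	/* 0: filtered before the lock */
	printf("order-4: %d, queue_len=%d\n", dsq_unqueue(&big), split_queue_len);
	return 0;
}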
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 110afda740a1..d2ceadd11b10 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5873,6 +5873,8 @@ static int mem_cgroup_move_account(struct page *page,
css_get(&to->css);
css_put(&from->css);
+ /* Warning should never happen, so don't worry about refcount non-0 */
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = (unsigned long)to;
__folio_memcg_unlock(from);
@@ -6237,7 +6239,10 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
enum mc_target_type target_type;
union mc_target target;
struct page *page;
+ struct folio *folio;
+ bool tried_split_before = false;
+retry_pmd:
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (mc.precharge < HPAGE_PMD_NR) {
@@ -6247,6 +6252,28 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
if (target_type == MC_TARGET_PAGE) {
page = target.page;
+ folio = page_folio(page);
+ /*
+ * Deferred split queue locking depends on memcg,
+ * and unqueue is unsafe unless folio refcount is 0:
+ * split or skip if on the queue? first try to split.
+ */
+ if (!list_empty(&folio->_deferred_list)) {
+ spin_unlock(ptl);
+ if (!tried_split_before)
+ split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ if (tried_split_before)
+ return 0;
+ tried_split_before = true;
+ goto retry_pmd;
+ }
+ /*
+ * So long as that pmd lock is held, the folio cannot
+ * be racily added to the _deferred_list, because
+ * page_remove_rmap() will find it still pmdmapped.
+ */
if (isolate_lru_page(page)) {
if (!mem_cgroup_move_account(page, true,
mc.from, mc.to)) {
@@ -7199,6 +7226,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
ug->nr_memory += nr_pages;
ug->pgpgout++;
+ WARN_ON_ONCE(folio_unqueue_deferred_split(folio));
folio->memcg_data = 0;
}
@@ -7492,6 +7520,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
VM_BUG_ON_FOLIO(oldid, folio);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
+ folio_unqueue_deferred_split(folio);
folio->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
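
In mem_cgroup_move_charge_pte_range(), the new logic is a bounded retry: a folio found on the deferred split queue (whose lock belongs to the memcg it is about to leave) gets one split attempt and one PMD retry, and is skipped if it is still queued the second time around. The toy userspace sketch below only traces that control flow; move_charge_pmd() and try_split() are invented stand-ins for illustration, not kernel functions.

#include <stdbool.h>
#include <stdio.h>

/* toy stand-in: pretend the split succeeds and takes the folio off the queue */
static void try_split(bool *on_deferred_queue)
{
	*on_deferred_queue = false;
}

/* mirrors the shape of the new code in mem_cgroup_move_charge_pte_range() */
static int move_charge_pmd(bool on_deferred_queue)
{
	bool tried_split_before = false;

retry_pmd:
	/* (imagine the PMD lock taken here) */
	if (on_deferred_queue) {
		/* (imagine the PMD lock dropped here) */
		if (!tried_split_before)
			try_split(&on_deferred_queue);
		if (tried_split_before)
			return 0;	/* second encounter: skip this folio */
		tried_split_before = true;
		goto retry_pmd;		/* retry the PMD exactly once */
	}
	printf("folio is off the queue: safe to move its memcg charge\n");
	return 1;
}

int main(void)
{
	move_charge_pmd(true);	/* queued: split once, then move */
	move_charge_pmd(false);	/* not queued: move immediately */
	return 0;
}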
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4cae854c0f28..109826a2af38 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2200,10 +2200,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
mpol_cond_put(pol);
gfp |= __GFP_COMP;
page = alloc_page_interleave(gfp, order, nid);
- folio = (struct folio *)page;
- if (folio && order > 1)
- folio_prep_large_rmappable(folio);
- goto out;
+ return page_rmappable_folio(page);
}
if (pol->mode == MPOL_PREFERRED_MANY) {
@@ -2213,10 +2210,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
gfp |= __GFP_COMP;
page = alloc_pages_preferred_many(gfp, order, node, pol);
mpol_cond_put(pol);
- folio = (struct folio *)page;
- if (folio && order > 1)
- folio_prep_large_rmappable(folio);
- goto out;
+ return page_rmappable_folio(page);
}
if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
@@ -2310,12 +2304,7 @@ EXPORT_SYMBOL(alloc_pages);
struct folio *folio_alloc(gfp_t gfp, unsigned order)
{
- struct page *page = alloc_pages(gfp | __GFP_COMP, order);
- struct folio *folio = (struct folio *)page;
-
- if (folio && order > 1)
- folio_prep_large_rmappable(folio);
- return folio;
+ return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order));
}
EXPORT_SYMBOL(folio_alloc);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1bbbf2f8b7e4..7272a922b838 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -600,9 +600,7 @@ void destroy_large_folio(struct folio *folio)
return;
}
- if (folio_test_large_rmappable(folio))
- folio_undo_large_rmappable(folio);
-
+ folio_unqueue_deferred_split(folio);
mem_cgroup_uncharge(folio);
free_the_page(&folio->page, folio_order(folio));
}
@@ -1002,10 +1000,11 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
}
break;
case 2:
- /*
- * the second tail page: ->mapping is
- * deferred_list.next -- ignore value.
- */
+ /* the second tail page: deferred_list overlaps ->mapping */
+ if (unlikely(!list_empty(&folio->_deferred_list))) {
+ bad_page(page, "on deferred list");
+ goto out;
+ }
break;
default:
if (page->mapping != TAIL_MAPPING) {
@@ -4464,12 +4463,8 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
- preferred_nid, nodemask);
- struct folio *folio = (struct folio *)page;
-
- if (folio && order > 1)
- folio_prep_large_rmappable(folio);
- return folio;
+ preferred_nid, nodemask);
+ return page_rmappable_folio(page);
}
EXPORT_SYMBOL(__folio_alloc);
diff --git a/mm/readahead.c b/mm/readahead.c
index 7c0449f8bec7..e9b11d928b0c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -514,16 +514,11 @@ void page_cache_ra_order(struct readahead_control *ractl,
unsigned int order = new_order;
/* Align with smaller pages if needed */
- if (index & ((1UL << order) - 1)) {
+ if (index & ((1UL << order) - 1))
order = __ffs(index);
- if (order == 1)
- order = 0;
- }
/* Don't allocate pages past EOF */
- while (index + (1UL << order) - 1 > limit) {
- if (--order == 1)
- order = 0;
- }
+ while (index + (1UL << order) - 1 > limit)
+ order--;
err = ra_alloc_folio(ractl, index, mark, order, gfp);
if (err)
break;
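
With the order-1 special cases gone, page_cache_ra_order() clamps the folio order in two plain steps: align down to the index's natural alignment (__ffs of the index) and decrement until the folio fits before EOF. The userspace sketch below reproduces just that arithmetic, emulating the kernel's __ffs() with __builtin_ctzl(); clamp_ra_order() and the sample values are invented for illustration.

#include <stdio.h>

static unsigned int clamp_ra_order(unsigned long index, unsigned long limit,
				   unsigned int order)
{
	/* Align with smaller pages if needed */
	if (index & ((1UL << order) - 1))
		order = __builtin_ctzl(index);

	/* Don't allocate pages past EOF (limit = last valid index) */
	while (index + (1UL << order) - 1 > limit)
		order--;

	return order;
}

int main(void)
{
	/* index 6 is only 2-aligned, so an order-3 request drops to order 1 */
	printf("%u\n", clamp_ra_order(6, 1000, 3));	/* 1 */
	/* index 8 is 8-aligned, but only 4 pages remain before limit 11 */
	printf("%u\n", clamp_ra_order(8, 11, 3));	/* 2 */
	return 0;
}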
diff --git a/mm/slab_common.c b/mm/slab_common.c
index ef971fcdaa07..2e2b43fae2c3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1391,7 +1391,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
/* Zero out spare memory. */
if (want_init_on_alloc(flags)) {
kasan_disable_current();
- memset((void *)p + new_size, 0, ks - new_size);
+ memset(kasan_reset_tag(p) + new_size, 0, ks - new_size);
kasan_enable_current();
}