diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 12 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/debug.c | 4 | ||||
-rw-r--r-- | mm/huge_memory.c | 12 | ||||
-rw-r--r-- | mm/migrate.c | 6 | ||||
-rw-r--r-- | mm/page_ext.c | 4 | ||||
-rw-r--r-- | mm/page_idle.c | 232 | ||||
-rw-r--r-- | mm/rmap.c | 6 | ||||
-rw-r--r-- | mm/swap.c | 3 |
9 files changed, 278 insertions, 2 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index 3a4070f5ab79..6413d027c0b2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -649,6 +649,18 @@ config DEFERRED_STRUCT_PAGE_INIT processes running early in the lifetime of the systemm until kswapd finishes the initialisation. +config IDLE_PAGE_TRACKING + bool "Enable idle page tracking" + depends on SYSFS && MMU + select PAGE_EXTENSION if !64BIT + help + This feature allows to estimate the amount of user pages that have + not been touched during a given period of time. This information can + be useful to tune memory cgroup limits and/or for job placement + within a compute cluster. + + See Documentation/vm/idle_page_tracking.txt for more details. + config ZONE_DEVICE bool "Device memory (pmem, etc...) hotplug support" if EXPERT default !ZONE_DMA diff --git a/mm/Makefile b/mm/Makefile index b424d5e5b6ff..56f8eed73f1a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -79,3 +79,4 @@ obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o +obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o diff --git a/mm/debug.c b/mm/debug.c index 76089ddf99ea..6c1b3ea61bfd 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -48,6 +48,10 @@ static const struct trace_print_flags pageflag_names[] = { #ifdef CONFIG_TRANSPARENT_HUGEPAGE {1UL << PG_compound_lock, "compound_lock" }, #endif +#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) + {1UL << PG_young, "young" }, + {1UL << PG_idle, "idle" }, +#endif }; static void dump_flags(unsigned long flags, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b16279cbd91d..4b06b8db9df2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -25,6 +25,7 @@ #include <linux/migrate.h> #include <linux/hashtable.h> #include <linux/userfaultfd_k.h> +#include <linux/page_idle.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -1757,6 +1758,11 @@ static void __split_huge_page_refcount(struct page *page, /* clear PageTail before overwriting first_page */ smp_wmb(); + if (page_is_young(page)) + set_page_young(page_tail); + if (page_is_idle(page)) + set_page_idle(page_tail); + /* * __split_huge_page_splitting() already set the * splitting bit in all pmd that could map this @@ -2262,7 +2268,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageLRU(page), page); /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval) || PageReferenced(page) || + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } @@ -2693,7 +2700,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, */ if (page_count(page) != 1 + !!PageSwapCache(page)) goto out_unmap; - if (pte_young(pteval) || PageReferenced(page) || + if (pte_young(pteval) || + page_is_young(page) || PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, address)) referenced = true; } diff --git a/mm/migrate.c b/mm/migrate.c index 02ce25df16c2..c3cb566af3e2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -37,6 +37,7 @@ #include <linux/gfp.h> #include <linux/balloon_compaction.h> #include <linux/mmu_notifier.h> +#include <linux/page_idle.h> #include <asm/tlbflush.h> @@ -524,6 +525,11 @@ void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + if (page_is_young(page)) + set_page_young(newpage); + if (page_is_idle(page)) + set_page_idle(newpage); + /* * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. diff --git a/mm/page_ext.c b/mm/page_ext.c index d86fd2f5353f..292ca7b8debd 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -6,6 +6,7 @@ #include <linux/vmalloc.h> #include <linux/kmemleak.h> #include <linux/page_owner.h> +#include <linux/page_idle.h> /* * struct page extension @@ -59,6 +60,9 @@ static struct page_ext_operations *page_ext_ops[] = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif +#if defined(CONFIG_IDLE_PAGE_TRACKING) && !defined(CONFIG_64BIT) + &page_idle_ops, +#endif }; static unsigned long total_usage; diff --git a/mm/page_idle.c b/mm/page_idle.c new file mode 100644 index 000000000000..d5dd79041484 --- /dev/null +++ b/mm/page_idle.c @@ -0,0 +1,232 @@ +#include <linux/init.h> +#include <linux/bootmem.h> +#include <linux/fs.h> +#include <linux/sysfs.h> +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/mmu_notifier.h> +#include <linux/page_ext.h> +#include <linux/page_idle.h> + +#define BITMAP_CHUNK_SIZE sizeof(u64) +#define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE) + +/* + * Idle page tracking only considers user memory pages, for other types of + * pages the idle flag is always unset and an attempt to set it is silently + * ignored. + * + * We treat a page as a user memory page if it is on an LRU list, because it is + * always safe to pass such a page to rmap_walk(), which is essential for idle + * page tracking. With such an indicator of user pages we can skip isolated + * pages, but since there are not usually many of them, it will hardly affect + * the overall result. + * + * This function tries to get a user memory page by pfn as described above. + */ +static struct page *page_idle_get_page(unsigned long pfn) +{ + struct page *page; + struct zone *zone; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (!page || !PageLRU(page) || + !get_page_unless_zero(page)) + return NULL; + + zone = page_zone(page); + spin_lock_irq(&zone->lru_lock); + if (unlikely(!PageLRU(page))) { + put_page(page); + page = NULL; + } + spin_unlock_irq(&zone->lru_lock); + return page; +} + +static int page_idle_clear_pte_refs_one(struct page *page, + struct vm_area_struct *vma, + unsigned long addr, void *arg) +{ + struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; + pmd_t *pmd; + pte_t *pte; + bool referenced = false; + + if (unlikely(PageTransHuge(page))) { + pmd = page_check_address_pmd(page, mm, addr, + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (pmd) { + referenced = pmdp_clear_young_notify(vma, addr, pmd); + spin_unlock(ptl); + } + } else { + pte = page_check_address(page, mm, addr, &ptl, 0); + if (pte) { + referenced = ptep_clear_young_notify(vma, addr, pte); + pte_unmap_unlock(pte, ptl); + } + } + if (referenced) { + clear_page_idle(page); + /* + * We cleared the referenced bit in a mapping to this page. To + * avoid interference with page reclaim, mark it young so that + * page_referenced() will return > 0. + */ + set_page_young(page); + } + return SWAP_AGAIN; +} + +static void page_idle_clear_pte_refs(struct page *page) +{ + /* + * Since rwc.arg is unused, rwc is effectively immutable, so we + * can make it static const to save some cycles and stack. + */ + static const struct rmap_walk_control rwc = { + .rmap_one = page_idle_clear_pte_refs_one, + .anon_lock = page_lock_anon_vma_read, + }; + bool need_lock; + + if (!page_mapped(page) || + !page_rmapping(page)) + return; + + need_lock = !PageAnon(page) || PageKsm(page); + if (need_lock && !trylock_page(page)) + return; + + rmap_walk(page, (struct rmap_walk_control *)&rwc); + + if (need_lock) + unlock_page(page); +} + +static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + u64 *out = (u64 *)buf; + struct page *page; + unsigned long pfn, end_pfn; + int bit; + + if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) + return -EINVAL; + + pfn = pos * BITS_PER_BYTE; + if (pfn >= max_pfn) + return 0; + + end_pfn = pfn + count * BITS_PER_BYTE; + if (end_pfn > max_pfn) + end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + + for (; pfn < end_pfn; pfn++) { + bit = pfn % BITMAP_CHUNK_BITS; + if (!bit) + *out = 0ULL; + page = page_idle_get_page(pfn); + if (page) { + if (page_is_idle(page)) { + /* + * The page might have been referenced via a + * pte, in which case it is not idle. Clear + * refs and recheck. + */ + page_idle_clear_pte_refs(page); + if (page_is_idle(page)) + *out |= 1ULL << bit; + } + put_page(page); + } + if (bit == BITMAP_CHUNK_BITS - 1) + out++; + cond_resched(); + } + return (char *)out - buf; +} + +static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t pos, size_t count) +{ + const u64 *in = (u64 *)buf; + struct page *page; + unsigned long pfn, end_pfn; + int bit; + + if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE) + return -EINVAL; + + pfn = pos * BITS_PER_BYTE; + if (pfn >= max_pfn) + return -ENXIO; + + end_pfn = pfn + count * BITS_PER_BYTE; + if (end_pfn > max_pfn) + end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS); + + for (; pfn < end_pfn; pfn++) { + bit = pfn % BITMAP_CHUNK_BITS; + if ((*in >> bit) & 1) { + page = page_idle_get_page(pfn); + if (page) { + page_idle_clear_pte_refs(page); + set_page_idle(page); + put_page(page); + } + } + if (bit == BITMAP_CHUNK_BITS - 1) + in++; + cond_resched(); + } + return (char *)in - buf; +} + +static struct bin_attribute page_idle_bitmap_attr = + __BIN_ATTR(bitmap, S_IRUSR | S_IWUSR, + page_idle_bitmap_read, page_idle_bitmap_write, 0); + +static struct bin_attribute *page_idle_bin_attrs[] = { + &page_idle_bitmap_attr, + NULL, +}; + +static struct attribute_group page_idle_attr_group = { + .bin_attrs = page_idle_bin_attrs, + .name = "page_idle", +}; + +#ifndef CONFIG_64BIT +static bool need_page_idle(void) +{ + return true; +} +struct page_ext_operations page_idle_ops = { + .need = need_page_idle, +}; +#endif + +static int __init page_idle_init(void) +{ + int err; + + err = sysfs_create_group(mm_kobj, &page_idle_attr_group); + if (err) { + pr_err("page_idle: register sysfs failed\n"); + return err; + } + return 0; +} +subsys_initcall(page_idle_init); diff --git a/mm/rmap.c b/mm/rmap.c index 0db38e7d0a72..f5b5c1f3dcd7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -59,6 +59,7 @@ #include <linux/migrate.h> #include <linux/hugetlb.h> #include <linux/backing-dev.h> +#include <linux/page_idle.h> #include <asm/tlbflush.h> @@ -886,6 +887,11 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } + if (referenced) + clear_page_idle(page); + if (test_and_clear_page_young(page)) + referenced++; + if (referenced) { pra->referenced++; pra->vm_flags |= vma->vm_flags; diff --git a/mm/swap.c b/mm/swap.c index a3a0a2f1f7c3..983f692a47fd 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -32,6 +32,7 @@ #include <linux/gfp.h> #include <linux/uio.h> #include <linux/hugetlb.h> +#include <linux/page_idle.h> #include "internal.h" @@ -622,6 +623,8 @@ void mark_page_accessed(struct page *page) } else if (!PageReferenced(page)) { SetPageReferenced(page); } + if (page_is_idle(page)) + clear_page_idle(page); } EXPORT_SYMBOL(mark_page_accessed); |